dvt-core 0.59.0a51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2660 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +60 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.py +273 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.py +1252 -0
- dbt/compute/metadata/__init__.py +63 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/catalog_store.py +1036 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.py +1020 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/spark_logger.py +272 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +472 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.py +408 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +888 -0
- dbt/config/project_utils.py +48 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +564 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +419 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_comprehensive_registry.py +1254 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/dvt_starter_project/README.md +15 -0
- dbt/include/dvt_starter_project/__init__.py +3 -0
- dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/dvt_project.yml +39 -0
- dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
- dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +122 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2208 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +506 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +513 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +1002 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +509 -0
- dbt/task/dvt_run.py +282 -0
- dbt/task/dvt_seed.py +806 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +1022 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.py +804 -0
- dbt/task/migrate.py +714 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.py +1489 -0
- dbt/task/profile_serve.py +662 -0
- dbt/task/retract.py +441 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1647 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +814 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +271 -0
- dvt_cli/__init__.py +158 -0
- dvt_core-0.59.0a51.dist-info/METADATA +288 -0
- dvt_core-0.59.0a51.dist-info/RECORD +299 -0
- dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
- dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
- dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/compute/strategies/standalone.py
ADDED
@@ -0,0 +1,262 @@
"""
Standalone Spark Cluster Connection Strategy

Provides connection to self-managed Spark clusters (on-premises or cloud VMs).

v0.5.98: New strategy for standalone Spark clusters with Maven-based JAR provisioning.
Fixes the bug where external clusters incorrectly fell back to LocalStrategy
with local JAR paths that don't exist on remote workers.

Configuration:
    {
        "master": "spark://master-node:7077",  # Required: Spark master URL
        "spark.driver.memory": "4g",           # Optional: driver memory
        "spark.executor.memory": "8g",         # Optional: executor memory
        "spark.executor.cores": "4",           # Optional: cores per executor
        "spark.executor.instances": "10",      # Optional: number of executors
    }

Requirements:
- Standalone Spark cluster must be running
- Spark master must be accessible from client machine
- Workers must have network access to Maven Central (for JAR downloads)
"""

from typing import Any, Dict, Optional, Set, Tuple

from dbt.compute.strategies.base import BaseConnectionStrategy
from dbt_common.exceptions import DbtRuntimeError

try:
    from pyspark.sql import SparkSession

    PYSPARK_AVAILABLE = True
except ImportError:
    PYSPARK_AVAILABLE = False
    SparkSession = None


class StandaloneStrategy(BaseConnectionStrategy):
    """
    Standalone Spark cluster connection strategy.

    Connects to self-managed Spark clusters using spark:// master URL.
    Uses spark.jars.packages for JDBC JAR provisioning so workers can
    download drivers from Maven Central.
    """

    def validate_config(self) -> None:
        """
        Validate Standalone strategy configuration.

        Required:
        - master: Must start with "spark://" for standalone clusters

        :raises DbtRuntimeError: If configuration is invalid
        """
        if not isinstance(self.config, dict):
            raise DbtRuntimeError(
                f"Standalone config must be a dictionary, got {type(self.config)}"
            )

        # Check master format
        master = self.config.get("master", "")
        if not master.startswith("spark://"):
            raise DbtRuntimeError(
                f"Standalone config requires master to start with 'spark://', got: {master}"
            )

    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
        """
        Create Spark session connected to standalone cluster.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Initialized SparkSession connected to standalone cluster
        :raises DbtRuntimeError: If session creation fails
        """
        if not PYSPARK_AVAILABLE:
            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")

        try:
            # v0.51.0: Ensure Java is available
            from dbt.compute.strategies.local import _ensure_java_available
            _ensure_java_available()

            # v0.51.0: Stop any existing session to ensure fresh config
            existing = SparkSession.getActiveSession()
            if existing:
                existing.stop()

            builder = SparkSession.builder.appName(self.app_name)

            # Set master URL
            master = self.config.get("master")
            builder = builder.master(master)

            # v0.5.99: Get JDBC JAR config (Maven coordinates for remote workers)
            # Merge with user-provided spark.jars.packages instead of overwriting
            if adapter_types is None:
                from dbt.compute.jar_provisioning import get_required_adapter_types
                adapter_types = get_required_adapter_types()

            auto_packages = []
            if adapter_types:
                jar_config = self.get_jar_provisioning_config(adapter_types)
                auto_packages_str = jar_config.get("spark.jars.packages", "")
                if auto_packages_str:
                    auto_packages = [p.strip() for p in auto_packages_str.split(",") if p.strip()]

            # Get user-provided packages from config
            user_packages_str = self.config.get("spark.jars.packages", "")
            user_packages = [p.strip() for p in user_packages_str.split(",") if p.strip()]

            # Merge packages (user + auto-detected)
            all_packages = list(set(user_packages + auto_packages))
            if all_packages:
                builder = builder.config("spark.jars.packages", ",".join(all_packages))

            # Apply user-provided configs (except spark.jars.packages which we merged)
            for key, value in self.config.items():
                if key != "master" and key != "spark.jars.packages":
                    builder = builder.config(key, value)

            # Default optimizations
            default_configs = {
                "spark.sql.execution.arrow.pyspark.enabled": "true",
                "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
            }
            for key, value in default_configs.items():
                if key not in self.config:
                    builder = builder.config(key, value)

            # DVT v0.51.5: Auto-configure driver host for Docker Spark clusters
            # When master is on localhost, workers (in Docker containers) need to reach
            # the driver running on the host machine via host.docker.internal
            if "spark.driver.host" not in self.config:
                if "localhost" in master or "127.0.0.1" in master:
                    builder = builder.config("spark.driver.host", "host.docker.internal")

            # Create session
            spark = builder.getOrCreate()
            spark.sparkContext.setLogLevel("WARN")

            return spark

        except Exception as e:
            error_msg = str(e)
            master = self.config.get("master", "unknown")
            if "Connection refused" in error_msg:
                raise DbtRuntimeError(
                    f"Cannot connect to Spark master at '{master}'. "
                    f"Ensure the cluster is running and accessible. Error: {error_msg}"
                ) from e
            raise DbtRuntimeError(f"Failed to create Standalone Spark session: {error_msg}") from e

    def close(self, spark: Optional[SparkSession]) -> None:
        """
        Clean up Spark session.

        For standalone clusters, we stop the application but the cluster continues running.

        :param spark: SparkSession to clean up
        """
        if spark:
            try:
                spark.stop()
            except Exception:
                pass  # Best effort cleanup

    def estimate_cost(self, duration_minutes: float) -> float:
        """
        Estimate cost for standalone cluster execution.

        For self-managed clusters, returns 0.0 as cost depends on infrastructure.

        :param duration_minutes: Estimated query duration in minutes
        :returns: 0.0 (infrastructure cost varies)
        """
        # Self-managed clusters have variable cost based on infrastructure
        return 0.0

    def get_platform_name(self) -> str:
        """Get platform name."""
        return "standalone"

    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark config for JDBC JAR provisioning using Maven coordinates.

        Standalone clusters need spark.jars.packages so workers can download
        JDBC drivers from Maven Central. Local file paths don't work because
        they're not available on remote worker nodes.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Dictionary with spark.jars.packages config
        """
        from dbt.compute.jar_provisioning import RemoteJARProvisioning

        provisioning = RemoteJARProvisioning()
        return provisioning.get_spark_config(adapter_types)

    def test_connectivity(self) -> Tuple[bool, str]:
        """
        Test connectivity to standalone Spark cluster.

        v0.51.1: Added timeout to prevent hanging when workers unavailable.
        v0.51.8: Increased timeout to 90s for Docker clusters (JDBC JAR download time).

        :returns: Tuple of (success, message)
        """
        if not PYSPARK_AVAILABLE:
            return (False, "PySpark not installed")

        import concurrent.futures

        master = self.config.get("master", "unknown")

        def _run_test():
            spark = self.get_spark_session()
            spark.sql("SELECT 1 AS test").collect()
            return True

        try:
            # Use ThreadPoolExecutor with timeout to prevent hanging
            # when workers aren't available
            # v0.51.8: Increased from 30s to 90s - Docker Spark clusters need time
            # for JDBC JAR downloads from Maven on first run
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(_run_test)
                try:
                    result = future.result(timeout=90)  # 90 second timeout for JAR downloads
                    return (True, "Standalone cluster session created and SQL test passed")
                except concurrent.futures.TimeoutError:
                    return (False,
                        f"Timeout (90s): Workers not responding at '{master}'.\n"
                        f"Check: cluster workers are running, network access from driver to workers.\n"
                        f"Note: First run may take longer due to JDBC JAR downloads."
                    )

        except Exception as e:
            error_msg = str(e)
            if "Connection refused" in error_msg:
                return (False, f"Cannot connect to Spark master at '{master}'")
            if "Initial job has not accepted any resources" in error_msg:
                return (False,
                    f"Workers not accepting tasks at '{master}'.\n"
                    f"Check: spark.driver.host is set correctly for your network topology"
                )
            return (False, f"Standalone connection failed: {e}")

    def get_cluster_info(self) -> Dict[str, Any]:
        """
        Get information about the standalone cluster configuration.

        :returns: Dictionary with cluster metadata
        """
        return {
            "platform": "standalone",
            "master": self.config.get("master", "unknown"),
            "executor_instances": self.config.get("spark.executor.instances", "dynamic"),
            "executor_memory": self.config.get("spark.executor.memory", "default"),
            "executor_cores": self.config.get("spark.executor.cores", "default"),
        }
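For orientation, a minimal usage sketch of this strategy follows. The constructor call is an assumption: only self.config and self.app_name are referenced in the diff, so how BaseConnectionStrategy actually receives them is not shown here; the adapter type "postgres" is likewise illustrative.

# Hypothetical usage sketch -- constructor signature assumed, not documented in this diff.
config = {
    "master": "spark://master-node:7077",   # required: standalone master URL
    "spark.executor.memory": "8g",
    "spark.executor.instances": "4",
}

strategy = StandaloneStrategy(config)       # assumed: config handed to the base class
strategy.validate_config()                  # raises DbtRuntimeError unless master starts with "spark://"

ok, message = strategy.test_connectivity()  # runs "SELECT 1" on the cluster with a 90s timeout
if ok:
    spark = strategy.get_spark_session({"postgres"})  # adapter types that need JDBC JARs from Maven
    try:
        spark.sql("SELECT 1 AS smoke_test").show()
    finally:
        strategy.close(spark)               # stops the application; the standalone cluster keeps running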
dbt/config/__init__.py
ADDED
dbt/config/catalogs.py
ADDED
@@ -0,0 +1,94 @@
import os
from copy import deepcopy
from typing import Any, Dict, List, Optional

from dbt.artifacts.resources import Catalog, CatalogWriteIntegrationConfig
from dbt.clients.yaml_helper import load_yaml_text
from dbt.config.renderer import SecretRenderer
from dbt.constants import CATALOGS_FILE_NAME
from dbt.exceptions import YamlLoadError
from dbt_common.clients.system import load_file_contents
from dbt_common.exceptions import CompilationError, DbtValidationError


def load_catalogs_yml(project_dir: str, project_name: str) -> Dict[str, Any]:
    path = os.path.join(project_dir, CATALOGS_FILE_NAME)

    if os.path.isfile(path):
        try:
            contents = load_file_contents(path, strip=False)
            yaml_content = load_yaml_text(contents)

            if not yaml_content:
                raise DbtValidationError(f"The file at {path} is empty")

            return yaml_content
        except DbtValidationError as e:
            raise YamlLoadError(project_name=project_name, path=CATALOGS_FILE_NAME, exc=e)

    return {}


def load_single_catalog(raw_catalog: Dict[str, Any], renderer: SecretRenderer) -> Catalog:
    try:
        rendered_catalog = renderer.render_data(raw_catalog)
    except CompilationError as exc:
        raise DbtValidationError(str(exc)) from exc

    Catalog.validate(rendered_catalog)

    write_integrations = []
    write_integration_names = set()

    for raw_integration in rendered_catalog.get("write_integrations", []):
        if raw_integration["name"] in write_integration_names:
            raise DbtValidationError(
                f"Catalog '{rendered_catalog['name']}' cannot have multiple 'write_integrations' with the same name: '{raw_integration['name']}'."
            )

        # We're going to let the adapter validate the integration config
        write_integrations.append(
            CatalogWriteIntegrationConfig(**raw_integration, catalog_name=raw_catalog["name"])
        )
        write_integration_names.add(raw_integration["name"])

    # Validate + set default active_write_integration if unset
    active_write_integration = rendered_catalog.get("active_write_integration")
    valid_write_integration_names = [integration.name for integration in write_integrations]

    if not active_write_integration:
        if len(valid_write_integration_names) == 1:
            active_write_integration = write_integrations[0].name
        else:
            raise DbtValidationError(
                f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' when multiple 'write_integrations' are provided."
            )
    else:
        if active_write_integration not in valid_write_integration_names:
            raise DbtValidationError(
                f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' from its set of defined 'write_integrations': {valid_write_integration_names}. Got: '{active_write_integration}'."
            )

    return Catalog(
        name=raw_catalog["name"],
        active_write_integration=active_write_integration,
        write_integrations=write_integrations,
    )


def load_catalogs(project_dir: str, project_name: str, cli_vars: Dict[str, Any]) -> List[Catalog]:
    raw_catalogs = load_catalogs_yml(project_dir, project_name).get("catalogs", [])
    catalogs_renderer = SecretRenderer(cli_vars)

    return [load_single_catalog(raw_catalog, catalogs_renderer) for raw_catalog in raw_catalogs]


def get_active_write_integration(catalog: Catalog) -> Optional[CatalogWriteIntegrationConfig]:
    for integration in catalog.write_integrations:
        if integration.name == catalog.active_write_integration:
            active_integration = deepcopy(integration)
            active_integration.catalog_name = active_integration.name
            active_integration.name = catalog.name
            return active_integration

    return None
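For reference, the input shape load_single_catalog expects can be sketched as a plain dict. Only the keys the module reads ("catalogs", "name", "write_integrations", "active_write_integration") are grounded in this diff; the example values and the bare-bones integrations are assumptions and would still have to satisfy Catalog.validate and CatalogWriteIntegrationConfig.

# Illustrative shape only: keys mirror what the loader reads; values are invented,
# and any extra integration fields are passed straight to CatalogWriteIntegrationConfig.
raw_catalog = {
    "name": "analytics_catalog",
    "write_integrations": [
        {"name": "prod_writer"},   # duplicate integration names are rejected
        {"name": "dev_writer"},
    ],
    # Required here because more than one write integration is defined.
    "active_write_integration": "prod_writer",
}

renderer = SecretRenderer({})                        # SecretRenderer(cli_vars), as in load_catalogs
catalog = load_single_catalog(raw_catalog, renderer)
active = get_active_write_integration(catalog)       # integration copy renamed to the catalog's name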