dvt-core 0.59.0a51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2660 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +60 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.py +273 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.py +1252 -0
- dbt/compute/metadata/__init__.py +63 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/catalog_store.py +1036 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.py +1020 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/spark_logger.py +272 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +472 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.py +408 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +888 -0
- dbt/config/project_utils.py +48 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +564 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +419 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_comprehensive_registry.py +1254 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/dvt_starter_project/README.md +15 -0
- dbt/include/dvt_starter_project/__init__.py +3 -0
- dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/dvt_project.yml +39 -0
- dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
- dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +122 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2208 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +506 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +513 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +1002 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +509 -0
- dbt/task/dvt_run.py +282 -0
- dbt/task/dvt_seed.py +806 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +1022 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.py +804 -0
- dbt/task/migrate.py +714 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.py +1489 -0
- dbt/task/profile_serve.py +662 -0
- dbt/task/retract.py +441 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1647 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +814 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +271 -0
- dvt_cli/__init__.py +158 -0
- dvt_core-0.59.0a51.dist-info/METADATA +288 -0
- dvt_core-0.59.0a51.dist-info/RECORD +299 -0
- dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
- dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
- dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/compute/strategies/base.py
@@ -0,0 +1,165 @@
+"""
+Base Connection Strategy for Spark Engines
+
+Defines the abstract interface for different Spark connection strategies.
+Uses composition over inheritance for flexible platform support.
+
+v0.5.98: Added JAR provisioning and connectivity testing methods.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Set, Tuple
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class BaseConnectionStrategy(ABC):
+    """
+    Abstract base class for Spark connection strategies.
+
+    Different strategies implement different ways to connect to Spark:
+    - LocalStrategy: Embedded PySpark (in-process)
+    - DatabricksStrategy: Databricks Connect (remote cluster)
+    - EMRStrategy: AWS EMR cluster
+    - DataprocStrategy: GCP Dataproc
+    - StandaloneStrategy: Self-managed Spark clusters
+    """
+
+    def __init__(self, config: Dict[str, Any], app_name: str = "DVT-Compute"):
+        """
+        Initialize connection strategy.
+
+        :param config: Strategy-specific configuration
+        :param app_name: Spark application name
+        """
+        self.config = config
+        self.app_name = app_name
+
+    @abstractmethod
+    def get_spark_session(self) -> SparkSession:
+        """
+        Create and return a SparkSession.
+
+        :returns: Initialized SparkSession
+        :raises DbtRuntimeError: If session creation fails
+        """
+        pass
+
+    @abstractmethod
+    def validate_config(self) -> None:
+        """
+        Validate strategy-specific configuration.
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        pass
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for running on this platform.
+
+        Default implementation returns 0.0 (free). Override for cloud platforms.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        return 0.0
+
+    @abstractmethod
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        :param spark: SparkSession to clean up (may be None)
+        """
+        pass
+
+    def get_platform_name(self) -> str:
+        """
+        Get human-readable platform name.
+
+        :returns: Platform name (e.g., "local", "databricks", "emr")
+        """
+        return self.__class__.__name__.replace("Strategy", "").lower()
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark configuration for JDBC JAR provisioning.
+
+        Default implementation returns empty dict. Override in subclasses
+        to provide platform-specific JAR configuration.
+
+        Local platforms use spark.jars (local file paths).
+        Remote platforms use spark.jars.packages (Maven coordinates).
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary of Spark config keys/values (e.g., {"spark.jars": "..."})
+        """
+        return {}
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test basic connectivity to the Spark cluster.
+
+        Creates a session, runs a simple query, and returns status.
+        Override for platform-specific connectivity testing.
+
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+            # Run a simple SQL query to verify connectivity
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Session created and SQL test passed")
+        except Exception as e:
+            return (False, str(e))
+
+    def test_jdbc_connectivity(
+        self,
+        jdbc_url: str,
+        properties: Dict[str, str],
+        table_or_query: str = "(SELECT 1 AS test) AS t",
+    ) -> Tuple[bool, str]:
+        """
+        Test JDBC connectivity through the Spark cluster.
+
+        Creates a session and attempts to read from a JDBC source.
+        This verifies that JDBC drivers are properly configured.
+
+        :param jdbc_url: JDBC connection URL
+        :param properties: JDBC connection properties (user, password, driver)
+        :param table_or_query: Table name or SQL query wrapped in parentheses
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+
+            # Attempt JDBC read
+            df = (
+                spark.read.format("jdbc")
+                .option("url", jdbc_url)
+                .option("dbtable", table_or_query)
+                .options(**properties)
+                .load()
+            )
+
+            # Force evaluation
+            row_count = df.count()
+            return (True, f"JDBC read successful ({row_count} rows)")
+        except Exception as e:
+            error_msg = str(e)
+            # Provide helpful error messages for common issues
+            if "ClassNotFoundException" in error_msg:
+                return (False, f"JDBC driver not found: {error_msg}")
+            elif "No suitable driver" in error_msg:
+                return (False, f"JDBC driver not loaded: {error_msg}")
+            elif "Authentication" in error_msg.lower() or "password" in error_msg.lower():
+                return (False, f"Authentication failed: {error_msg}")
+            else:
+                return (False, f"JDBC test failed: {error_msg}")
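
Editor's note: for orientation, a minimal sketch of what a concrete subclass of this interface has to provide. It is illustrative only and is not part of the wheel (the package ships its own LocalStrategy in dbt/compute/strategies/local.py); the class name InProcessStrategy is hypothetical, and the imports are assumed to resolve as shown in the diff above.

# Minimal sketch of a concrete strategy, assuming the interface in base.py above.
from typing import Optional

from dbt.compute.strategies.base import (
    PYSPARK_AVAILABLE,
    BaseConnectionStrategy,
    SparkSession,
)
from dbt_common.exceptions import DbtRuntimeError


class InProcessStrategy(BaseConnectionStrategy):
    """Run Spark embedded in the current Python process (illustrative only)."""

    def validate_config(self) -> None:
        # An embedded session needs no required keys in this sketch.
        if not isinstance(self.config, dict):
            raise DbtRuntimeError(f"Config must be a dictionary, got {type(self.config)}")

    def get_spark_session(self) -> SparkSession:
        if not PYSPARK_AVAILABLE:
            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
        builder = SparkSession.builder.appName(self.app_name).master("local[*]")
        for key, value in self.config.items():
            builder = builder.config(key, value)
        return builder.getOrCreate()

    def close(self, spark: Optional[SparkSession]) -> None:
        if spark:
            spark.stop()

With only these three methods implemented, the inherited helpers (test_connectivity, test_jdbc_connectivity, get_platform_name) work unchanged, which is the point of the composition-based design described in the module docstring.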
dbt/compute/strategies/dataproc.py
@@ -0,0 +1,207 @@
+"""
+GCP Dataproc Spark Connection Strategy
+
+Provides connection to Google Cloud Dataproc Spark clusters.
+
+v0.5.98: New strategy for GCP Dataproc clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "project": "my-gcp-project",       # Required: GCP project ID
+        "region": "us-central1",           # Required: Dataproc region
+        "cluster": "my-dataproc-cluster",  # Required: Cluster name
+        "spark.driver.memory": "4g",       # Optional: driver memory
+        "spark.executor.memory": "8g",     # Optional: executor memory
+    }
+
+Requirements:
+    - GCP Dataproc cluster must be running
+    - gcloud SDK configured (gcloud auth login)
+    - Dataproc connector or direct YARN access
+
+Cost Estimate:
+    - Typical 5-node Dataproc cluster: ~$1.00/hr (n1-standard-4 instances)
+    - Dataproc pricing includes Spark/Hadoop runtime at no extra cost
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class DataprocStrategy(BaseConnectionStrategy):
+    """
+    GCP Dataproc Spark cluster connection strategy.
+
+    Connects to Dataproc clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate Dataproc strategy configuration.
+
+        Required:
+        - project: GCP project ID
+        - region: Dataproc region
+        - cluster: Cluster name
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"Dataproc config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check required fields
+        required_fields = ["project", "region", "cluster"]
+        missing = [f for f in required_fields if f not in self.config]
+        if missing:
+            raise DbtRuntimeError(
+                f"Dataproc config missing required fields: {', '.join(missing)}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to Dataproc cluster.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to Dataproc
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master for Dataproc
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key not in ("project", "region", "cluster"):
+                    builder = builder.config(key, value)
+
+            # Default Dataproc optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to Dataproc cluster '{self.config.get('cluster')}'. "
+                    f"Ensure the cluster is running. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create Dataproc Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For Dataproc, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for Dataproc execution.
+
+        Based on typical 5-node Dataproc cluster with n1-standard-4 instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical Dataproc cluster: 5x n1-standard-4 @ ~$0.19/hr each = ~$0.95/hr total
+        # Plus Dataproc fee: $0.01/vCPU/hr = ~$0.20/hr for 20 vCPUs
+        hourly_cost = 1.15
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "dataproc"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        Dataproc clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to Dataproc cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Dataproc session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to Dataproc cluster (connection refused)")
+            return (False, f"Dataproc connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the Dataproc configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "dataproc",
+            "project": self.config.get("project", "unknown"),
+            "region": self.config.get("region", "unknown"),
+            "cluster": self.config.get("cluster", "unknown"),
+        }
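
Editor's note: a hedged usage sketch for the strategy above, assuming the module path dbt.compute.strategies.dataproc from the RECORD listing. The config keys and method calls are the ones shown in the diff; the values are placeholders, and in DVT they would presumably come from the compute configuration (dbt/config/compute.py) rather than being constructed by hand.

# Sketch: driving DataprocStrategy directly.
from dbt.compute.strategies.dataproc import DataprocStrategy

strategy = DataprocStrategy(
    config={
        "project": "my-gcp-project",
        "region": "us-central1",
        "cluster": "my-dataproc-cluster",
        "spark.executor.memory": "8g",
    },
    app_name="DVT-Compute",
)

strategy.validate_config()                   # raises DbtRuntimeError if project/region/cluster missing
ok, message = strategy.test_connectivity()   # creates a YARN session and runs SELECT 1
print(ok, message)
print(strategy.get_cluster_info())           # {'platform': 'dataproc', 'project': ..., ...}
print(strategy.estimate_cost(duration_minutes=60))  # 1.15 (USD, one hour at the assumed cluster rate)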
dbt/compute/strategies/emr.py
@@ -0,0 +1,203 @@
+"""
+AWS EMR (Elastic MapReduce) Spark Connection Strategy
+
+Provides connection to AWS EMR Spark clusters via YARN.
+
+v0.5.98: New strategy for AWS EMR clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "master": "yarn",                     # Required: YARN resource manager
+        "spark.submit.deployMode": "client",  # Optional: client or cluster
+        "spark.driver.memory": "4g",          # Optional: driver memory
+        "spark.executor.memory": "8g",        # Optional: executor memory
+        "spark.executor.instances": "4",      # Optional: number of executors
+    }
+
+Requirements:
+    - AWS EMR cluster must be running
+    - AWS credentials configured (aws configure or IAM role)
+    - Spark must be accessible from client machine (e.g., via SSH tunnel or VPN)
+
+Cost Estimate:
+    - Typical 5-node EMR cluster: ~$1.20/hr (m5.xlarge instances)
+    - On-demand pricing varies by instance type and region
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class EMRStrategy(BaseConnectionStrategy):
+    """
+    AWS EMR Spark cluster connection strategy.
+
+    Connects to EMR clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate EMR strategy configuration.
+
+        Required:
+        - master: Must be "yarn" for EMR
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"EMR config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check master is yarn
+        master = self.config.get("master", "")
+        if master.lower() != "yarn":
+            raise DbtRuntimeError(
+                f"EMR config requires master='yarn', got: {master}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to EMR cluster via YARN.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to EMR
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key != "master":  # master already set
+                    builder = builder.config(key, value)
+
+            # Default EMR optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to EMR cluster. Ensure the cluster is running "
+                    f"and accessible from this machine. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create EMR Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For EMR, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for EMR execution.
+
+        Based on typical 5-node EMR cluster with m5.xlarge instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical EMR cluster: 5x m5.xlarge @ ~$0.24/hr each = ~$1.20/hr total
+        hourly_cost = 1.20
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "emr"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        EMR clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to EMR cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "EMR session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to EMR cluster (connection refused)")
+            return (False, f"EMR connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the EMR configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "emr",
+            "master": self.config.get("master", "yarn"),
+            "deploy_mode": self.config.get("spark.submit.deployMode", "client"),
+            "executor_instances": self.config.get("spark.executor.instances", "dynamic"),
+        }
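
Editor's note: a short usage sketch for EMRStrategy, assuming the module path dbt.compute.strategies.emr from the RECORD listing. The config keys and behaviors (master must be "yarn", defaults layered under user config, close stops only the application) are the ones in the diff; everything else is placeholder and must be run from a machine that can reach the cluster's YARN resource manager.

# Sketch: validating and exercising the EMR strategy.
from dbt.compute.strategies.emr import EMRStrategy

strategy = EMRStrategy(
    config={
        "master": "yarn",
        "spark.executor.instances": "4",
    }
)

strategy.validate_config()               # DbtRuntimeError unless master == "yarn"
ok, message = strategy.test_connectivity()
if not ok:
    # e.g. "Cannot connect to EMR cluster (connection refused)" when no tunnel/VPN is up
    print(message)
else:
    spark = strategy.get_spark_session()
    try:
        spark.sql("SELECT 1 AS test").show()
    finally:
        strategy.close(spark)            # stops the application; the EMR cluster keeps running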