dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between package versions.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/compute/strategies/dataproc.py
@@ -0,0 +1,207 @@
+"""
+GCP Dataproc Spark Connection Strategy
+
+Provides connection to Google Cloud Dataproc Spark clusters.
+
+v0.5.98: New strategy for GCP Dataproc clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "project": "my-gcp-project",       # Required: GCP project ID
+        "region": "us-central1",           # Required: Dataproc region
+        "cluster": "my-dataproc-cluster",  # Required: Cluster name
+        "spark.driver.memory": "4g",       # Optional: driver memory
+        "spark.executor.memory": "8g",     # Optional: executor memory
+    }
+
+Requirements:
+    - GCP Dataproc cluster must be running
+    - gcloud SDK configured (gcloud auth login)
+    - Dataproc connector or direct YARN access
+
+Cost Estimate:
+    - Typical 5-node Dataproc cluster: ~$1.00/hr (n1-standard-4 instances)
+    - Dataproc pricing includes Spark/Hadoop runtime at no extra cost
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class DataprocStrategy(BaseConnectionStrategy):
+    """
+    GCP Dataproc Spark cluster connection strategy.
+
+    Connects to Dataproc clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate Dataproc strategy configuration.
+
+        Required:
+        - project: GCP project ID
+        - region: Dataproc region
+        - cluster: Cluster name
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"Dataproc config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check required fields
+        required_fields = ["project", "region", "cluster"]
+        missing = [f for f in required_fields if f not in self.config]
+        if missing:
+            raise DbtRuntimeError(
+                f"Dataproc config missing required fields: {', '.join(missing)}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to Dataproc cluster.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to Dataproc
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master for Dataproc
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key not in ("project", "region", "cluster"):
+                    builder = builder.config(key, value)
+
+            # Default Dataproc optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to Dataproc cluster '{self.config.get('cluster')}'. "
+                    f"Ensure the cluster is running. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create Dataproc Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For Dataproc, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for Dataproc execution.
+
+        Based on typical 5-node Dataproc cluster with n1-standard-4 instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical Dataproc cluster: 5x n1-standard-4 @ ~$0.19/hr each = ~$0.95/hr total
+        # Plus Dataproc fee: $0.01/vCPU/hr = ~$0.20/hr for 20 vCPUs
+        hourly_cost = 1.15
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "dataproc"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        Dataproc clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to Dataproc cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Dataproc session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to Dataproc cluster (connection refused)")
+            return (False, f"Dataproc connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the Dataproc configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "dataproc",
+            "project": self.config.get("project", "unknown"),
+            "region": self.config.get("region", "unknown"),
+            "cluster": self.config.get("cluster", "unknown"),
+        }

Binary file
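For orientation, a minimal usage sketch of the strategy above. The BaseConnectionStrategy constructor is not part of this diff, so the (config, app_name) signature and the adapter-type names below are assumptions inferred from the self.config and self.app_name attributes the class uses:

# Hypothetical usage sketch -- constructor signature assumed, not shown in this diff.
from dbt.compute.strategies.dataproc import DataprocStrategy

config = {
    "project": "my-gcp-project",       # required: GCP project ID
    "region": "us-central1",           # required: Dataproc region
    "cluster": "my-dataproc-cluster",  # required: cluster name
    "spark.executor.memory": "8g",     # optional: forwarded to builder.config()
}

strategy = DataprocStrategy(config=config, app_name="dvt-dataproc")  # assumed signature
strategy.validate_config()  # raises DbtRuntimeError if project/region/cluster are missing

ok, message = strategy.test_connectivity()
if ok:
    # "postgres" here is an illustrative adapter type needing a JDBC driver
    spark = strategy.get_spark_session(adapter_types={"postgres"})
    try:
        spark.sql("SELECT 1 AS test").show()
    finally:
        strategy.close(spark)  # stops the application; the Dataproc cluster keeps running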
dbt/compute/strategies/emr.py
@@ -0,0 +1,203 @@
+"""
+AWS EMR (Elastic MapReduce) Spark Connection Strategy
+
+Provides connection to AWS EMR Spark clusters via YARN.
+
+v0.5.98: New strategy for AWS EMR clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "master": "yarn",                     # Required: YARN resource manager
+        "spark.submit.deployMode": "client",  # Optional: client or cluster
+        "spark.driver.memory": "4g",          # Optional: driver memory
+        "spark.executor.memory": "8g",        # Optional: executor memory
+        "spark.executor.instances": "4",      # Optional: number of executors
+    }
+
+Requirements:
+    - AWS EMR cluster must be running
+    - AWS credentials configured (aws configure or IAM role)
+    - Spark must be accessible from client machine (e.g., via SSH tunnel or VPN)
+
+Cost Estimate:
+    - Typical 5-node EMR cluster: ~$1.20/hr (m5.xlarge instances)
+    - On-demand pricing varies by instance type and region
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class EMRStrategy(BaseConnectionStrategy):
+    """
+    AWS EMR Spark cluster connection strategy.
+
+    Connects to EMR clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate EMR strategy configuration.
+
+        Required:
+        - master: Must be "yarn" for EMR
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"EMR config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check master is yarn
+        master = self.config.get("master", "")
+        if master.lower() != "yarn":
+            raise DbtRuntimeError(
+                f"EMR config requires master='yarn', got: {master}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to EMR cluster via YARN.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to EMR
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key != "master":  # master already set
+                    builder = builder.config(key, value)
+
+            # Default EMR optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to EMR cluster. Ensure the cluster is running "
+                    f"and accessible from this machine. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create EMR Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For EMR, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for EMR execution.
+
+        Based on typical 5-node EMR cluster with m5.xlarge instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical EMR cluster: 5x m5.xlarge @ ~$0.24/hr each = ~$1.20/hr total
+        hourly_cost = 1.20
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "emr"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        EMR clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to EMR cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "EMR session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to EMR cluster (connection refused)")
+            return (False, f"EMR connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the EMR configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "emr",
+            "master": self.config.get("master", "yarn"),
+            "deploy_mode": self.config.get("spark.submit.deployMode", "client"),
+            "executor_instances": self.config.get("spark.executor.instances", "dynamic"),
+        }

Binary file
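The cost model in both strategies is a flat hourly rate prorated by wall-clock duration. A quick worked check of EMRStrategy.estimate_cost, under the same assumed constructor signature as the Dataproc sketch above:

# Hypothetical usage sketch -- constructor signature assumed, not shown in this diff.
from dbt.compute.strategies.emr import EMRStrategy

strategy = EMRStrategy(config={"master": "yarn"}, app_name="dvt-emr")  # assumed signature
strategy.validate_config()  # passes: master == "yarn"

# 5x m5.xlarge @ ~$0.24/hr => $1.20/hr, prorated: cost = round(1.20 * minutes / 60, 2)
assert strategy.estimate_cost(30.0) == 0.60  # half an hour
assert strategy.estimate_cost(90.0) == 1.80  # 1.5 hours

print(strategy.get_cluster_info())
# {'platform': 'emr', 'master': 'yarn', 'deploy_mode': 'client', 'executor_instances': 'dynamic'}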