dvt-core 0.58.6 (cp311-cp311-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/config/compute.py
ADDED
@@ -0,0 +1,513 @@
"""
Compute Cluster Registry

Manages external compute cluster configurations for DVT.

v0.55.0: Computes stored in <project>/.dvt/computes.yml (project-level)
Managed exclusively via `dvt compute` CLI commands.
Contains comprehensive commented samples for all platforms.
"""

import os
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
from dbt_common.exceptions import DbtRuntimeError


def get_project_dvt_dir(project_dir) -> Path:
    """Get the DVT config directory for a project (<project>/.dvt/).

    :param project_dir: Path to project root directory (str or Path)
    """
    return Path(project_dir) / ".dvt"


class SparkPlatform(Enum):
    """Spark platform types for connection strategies.

    v0.51.2: Removed DATABRICKS (serverless cannot read external JDBC sources).
    """

    LOCAL = "local"
    EMR = "emr"
    DATAPROC = "dataproc"
    STANDALONE = "standalone"  # Self-managed Spark clusters (spark://)
    EXTERNAL = "external"  # Generic external cluster (fallback)


# Default computes.yml template with comprehensive commented samples
DEFAULT_COMPUTES_YAML = """# ============================================================================
# DVT Compute Engines Configuration (v0.5.98)
# ============================================================================
# This file defines Spark compute engines for federated query execution.
#
# Commands:
#   dvt compute test        Test connectivity to all compute engines
#   dvt compute edit        Open this file in your editor
#   dvt compute validate    Validate YAML syntax
#
# JDBC JAR Provisioning (v0.5.98):
#   - Local Spark: Uses spark.jars with local file paths (fast startup)
#   - Remote clusters: Uses spark.jars.packages with Maven coordinates
#     (workers download JARs from Maven Central at session start)
#
# Platform Detection:
#   DVT auto-detects the platform from config keys:
#   - Dataproc: project + region + cluster
#   - EMR: master=yarn (without Dataproc keys)
#   - Standalone: master=spark://...
#   - Local: master=local[*] or no master
# ============================================================================

# Default compute engine (must match a name in 'computes' section)
target_compute: spark-local

# ============================================================================
# COMPUTE ENGINES
# ============================================================================
# Each compute engine must have:
#   - type: 'spark' (currently the only supported type)
#   - config: Spark configuration options
#   - description: (optional) Human-readable description
# ============================================================================

computes:

  # --------------------------------------------------------------------------
  # LOCAL SPARK (Default - Works out of the box)
  # --------------------------------------------------------------------------
  # Embedded PySpark for development and small-medium datasets.
  # Uses spark.jars with local file paths for fast startup.
  # JDBC JARs are auto-discovered from profiles.yml connections.
  #
  # Cost: Free (runs on your local machine)
  # Best for: Development, testing, datasets < 10GB
  # --------------------------------------------------------------------------
  spark-local:
    type: spark
    description: "Local Spark for development and testing"
    config:
      master: "local[2]"  # Use 2 CPU cores (local[*] for all)
      spark.driver.memory: "2g"  # Driver memory
      spark.executor.memory: "2g"  # Executor memory
      spark.ui.enabled: "false"  # Disable Spark UI
      spark.ui.showConsoleProgress: "false"  # No progress bars
      # Spark 4.0 legacy compatibility flags
      spark.sql.legacy.postgres.datetimeMapping.enabled: "true"
      spark.sql.legacy.mysql.timestampNTZMapping.enabled: "true"
      spark.sql.legacy.oracle.timestampMapping.enabled: "true"
      spark.sql.legacy.mssqlserver.numericMapping.enabled: "true"
      # Performance optimizations
      spark.sql.shuffle.partitions: "8"
      spark.sql.execution.arrow.pyspark.enabled: "true"
      spark.sql.execution.arrow.pyspark.fallback.enabled: "true"
      spark.sql.adaptive.enabled: "true"
      spark.sql.adaptive.coalescePartitions.enabled: "true"

  # --------------------------------------------------------------------------
  # AWS EMR (Elastic MapReduce)
  # --------------------------------------------------------------------------
  # Connects to AWS EMR clusters via YARN.
  # JDBC drivers are provisioned via spark.jars.packages (Maven).
  #
  # Requirements:
  #   - AWS credentials configured (aws configure or IAM role)
  #   - EMR cluster must be running
  #   - Network access to EMR master node
  #
  # Cost: ~$1.20/hr (typical 5-node m5.xlarge cluster)
  # Best for: AWS-native workloads, S3 data integration
  # --------------------------------------------------------------------------
  # emr-cluster:
  #   type: spark
  #   description: "AWS EMR Spark Cluster"
  #   config:
  #     master: "yarn"  # Required: YARN resource manager
  #     spark.submit.deployMode: "client"  # Client mode for interactive
  #     spark.driver.memory: "4g"
  #     spark.executor.memory: "8g"
  #     spark.executor.instances: "4"
  #     spark.dynamicAllocation.enabled: "true"

  # --------------------------------------------------------------------------
  # GCP DATAPROC (Google Cloud Spark)
  # --------------------------------------------------------------------------
  # Connects to GCP Dataproc clusters via YARN.
  # JDBC drivers are provisioned via spark.jars.packages (Maven).
  #
  # Requirements:
  #   - gcloud SDK configured (gcloud auth login)
  #   - Dataproc cluster must be running
  #   - Network access to Dataproc master
  #
  # Cost: ~$1.15/hr (typical 5-node n1-standard-4 cluster)
  # Best for: GCP-native workloads, BigQuery/GCS integration
  # --------------------------------------------------------------------------
  # dataproc-cluster:
  #   type: spark
  #   description: "GCP Dataproc Cluster"
  #   config:
  #     project: "my-gcp-project"  # Required: GCP project ID
  #     region: "us-central1"  # Required: Dataproc region
  #     cluster: "my-dataproc-cluster"  # Required: Cluster name
  #     spark.driver.memory: "4g"
  #     spark.executor.memory: "8g"
  #     spark.dynamicAllocation.enabled: "true"

  # --------------------------------------------------------------------------
  # STANDALONE SPARK CLUSTER
  # --------------------------------------------------------------------------
  # Connects to self-managed Spark clusters (on-premises or cloud VMs).
  # JDBC drivers are provisioned via spark.jars.packages (Maven).
  # Workers download JARs from Maven Central at session start.
  #
  # Requirements:
  #   - Spark master accessible at spark://host:port
  #   - Workers must have network access to Maven Central
  #
  # Cost: Infrastructure-dependent (your own hardware/VMs)
  # Best for: On-premises deployments, custom Spark configurations
  # --------------------------------------------------------------------------
  # spark-cluster:
  #   type: spark
  #   description: "Standalone Spark Cluster"
  #   config:
  #     master: "spark://master-node:7077"  # Required: Spark master URL
  #     spark.driver.memory: "4g"
  #     spark.executor.memory: "8g"
  #     spark.executor.cores: "4"
  #     spark.executor.instances: "10"

  # --------------------------------------------------------------------------
  # HIGH-MEMORY LOCAL SPARK
  # --------------------------------------------------------------------------
  # For larger local workloads (requires more system RAM).
  # Same JAR provisioning as spark-local (local file paths).
  #
  # Cost: Free (runs on your local machine)
  # Best for: Larger datasets on powerful workstations
  # --------------------------------------------------------------------------
  # spark-local-large:
  #   type: spark
  #   description: "High-memory local Spark for large datasets"
  #   config:
  #     master: "local[*]"  # Use all available cores
  #     spark.driver.memory: "8g"
  #     spark.executor.memory: "8g"
  #     spark.sql.shuffle.partitions: "200"
  #     spark.sql.adaptive.enabled: "true"
  #     spark.sql.adaptive.coalescePartitions.enabled: "true"
  #     spark.sql.adaptive.skewJoin.enabled: "true"
  #     spark.memory.fraction: "0.8"
  #     spark.memory.storageFraction: "0.3"

# ============================================================================
# CONFIGURATION REFERENCE
# ============================================================================
# Common Spark configurations:
#
# Memory:
#   spark.driver.memory: "4g"  # Driver memory (default 1g)
#   spark.executor.memory: "4g"  # Executor memory (default 1g)
#   spark.memory.fraction: "0.6"  # Fraction for execution/storage
#
# Parallelism:
#   spark.executor.cores: "4"  # Cores per executor
#   spark.executor.instances: "4"  # Number of executors
#   spark.sql.shuffle.partitions: "200"  # Shuffle partitions
#   spark.default.parallelism: "100"  # Default parallelism
#
# Arrow (PyArrow integration):
#   spark.sql.execution.arrow.pyspark.enabled: "true"
#   spark.sql.execution.arrow.maxRecordsPerBatch: "10000"
#
# Adaptive Query Execution (Spark 3.0+):
#   spark.sql.adaptive.enabled: "true"
#   spark.sql.adaptive.coalescePartitions.enabled: "true"
#   spark.sql.adaptive.skewJoin.enabled: "true"
#
# JDBC JAR Provisioning (v0.5.98):
#   Local Spark:
#     - Uses spark.jars with local file paths
#     - Fast startup (no download needed)
#     - JARs auto-discovered from profiles.yml
#
#   Remote Clusters (EMR, Dataproc, Standalone):
#     - Uses spark.jars.packages with Maven coordinates
#     - Workers download JARs at session start
#     - Supported databases: PostgreSQL, MySQL, Oracle, SQL Server,
#       Snowflake, Redshift, BigQuery, Teradata, DB2, and 30+ more
# ============================================================================
"""


@dataclass
class ComputeCluster:
    """Configuration for an external compute cluster."""

    name: str  # Cluster identifier
    type: str  # 'spark' (currently only Spark supported for external)
    config: Dict[str, Any] = field(default_factory=dict)  # Cluster-specific config
    description: Optional[str] = None
    cost_per_hour: Optional[float] = None  # Estimated cost per hour (USD)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to dictionary."""
        result = {
            "type": self.type,
            "config": self.config,
        }
        if self.description:
            result["description"] = self.description
        if self.cost_per_hour is not None:
            result["cost_per_hour"] = self.cost_per_hour
        return result

    @classmethod
    def from_dict(cls, name: str, data: Dict[str, Any]) -> "ComputeCluster":
        """Deserialize from dictionary."""
        return cls(
            name=name,
            type=data.get("type", "spark"),
            config=data.get("config", {}),
            description=data.get("description"),
            cost_per_hour=data.get("cost_per_hour"),
        )

    def detect_platform(self) -> SparkPlatform:
        """
        Detect Spark platform from configuration keys.

        v0.51.2: Removed Databricks support.
        Detection order (most specific first):
        1. Dataproc: project + region + cluster
        2. EMR: master=yarn (without Dataproc keys)
        3. Standalone: master=spark://
        4. Local: master=local[*] or no master
        5. External: fallback for unknown configurations

        :returns: SparkPlatform enum value
        """
        if self.type != "spark":
            return SparkPlatform.EXTERNAL

        config_keys = set(self.config.keys())

        # 1. Dataproc: has project, region, and cluster
        if all(k in config_keys for k in ["project", "region", "cluster"]):
            return SparkPlatform.DATAPROC

        # Check master value for remaining platforms
        if "master" in config_keys:
            master = str(self.config["master"]).lower()

            # 2. EMR: master=yarn (without Dataproc keys)
            if master == "yarn":
                return SparkPlatform.EMR

            # 3. Standalone: master=spark://
            if master.startswith("spark://"):
                return SparkPlatform.STANDALONE

            # 4. Local: master=local[*]
            if master.startswith("local"):
                return SparkPlatform.LOCAL

            # 5. External: unknown master format
            return SparkPlatform.EXTERNAL

        # Default to local (no master specified)
        return SparkPlatform.LOCAL


class ComputeRegistry:
    """
    Registry for managing external compute clusters.

    v0.55.0: Clusters stored in <project>/.dvt/computes.yml (project-level)
    Managed exclusively via `dvt compute` CLI commands.
    """

    def __init__(self, project_dir=None):
        """
        Initialize compute registry.

        :param project_dir: Path to project root directory (str or Path)
        """
        self.project_dir = str(project_dir) if project_dir else os.getcwd()

        # v0.55.0: Project-level paths
        self.project_dvt_dir = get_project_dvt_dir(self.project_dir)
        self.compute_file = self.project_dvt_dir / "computes.yml"
        self.jdbc_jars_dir = self.project_dvt_dir / "jdbc_jars"

        self._clusters: Dict[str, ComputeCluster] = {}
        self._target_compute: Optional[str] = None
        self._load()

    def _load(self) -> None:
        """Load clusters from storage.

        v0.55.0: Only project-level <project>/.dvt/computes.yml is supported.
        """
        # Load from project-level YAML file if it exists
        if self.compute_file.exists():
            self._load_from_yaml()
            return

        # No file exists - load defaults (will be saved when ensure_config_exists is called)
        self._load_defaults()

    def _load_from_yaml(self) -> None:
        """Load clusters from YAML file."""
        try:
            with open(self.compute_file, "r") as f:
                data = yaml.safe_load(f)

            if not data:
                self._load_defaults()
                return

            # Parse target_compute (default compute engine)
            self._target_compute = data.get("target_compute", "spark-local")

            # Parse computes
            computes_data = data.get("computes", {})
            for name, cluster_data in computes_data.items():
                if cluster_data:  # Skip None/empty entries
                    cluster = ComputeCluster.from_dict(name, cluster_data)
                    self._clusters[cluster.name] = cluster

            # If no computes defined, use defaults
            if not self._clusters:
                self._load_defaults()

        except Exception as e:
            raise DbtRuntimeError(f"Failed to load compute registry: {str(e)}") from e

    def _load_defaults(self) -> None:
        """Load default out-of-box compute engines."""
        data = yaml.safe_load(DEFAULT_COMPUTES_YAML)

        self._target_compute = data.get("target_compute", "spark-local")

        computes_data = data.get("computes", {})
        for name, cluster_data in computes_data.items():
            if cluster_data:  # Skip None entries (commented out samples)
                cluster = ComputeCluster.from_dict(name, cluster_data)
                self._clusters[cluster.name] = cluster

    def _save(self) -> None:
        """Save clusters to YAML file at project-level."""
        # Ensure project .dvt directory exists
        self.project_dvt_dir.mkdir(parents=True, exist_ok=True)

        # Build the YAML content with active computes
        computes_dict = {}
        for cluster in self._clusters.values():
            computes_dict[cluster.name] = cluster.to_dict()

        # If file exists, try to preserve comments by updating only the active section
        # For simplicity, we'll write the full template with active computes
        yaml_content = f"""# ============================================================================
# DVT Compute Engines Configuration
# ============================================================================
# This file defines Spark compute engines for federated query execution.
# Edit with: dvt compute edit
# Validate with: dvt compute validate
# Test with: dvt compute test
# ============================================================================

# Default compute engine (must match a name in 'computes' section)
target_compute: {self._target_compute or 'spark-local'}

computes:
"""
        # Add active computes
        for name, cluster in self._clusters.items():
            yaml_content += f"\n  {name}:\n"
            yaml_content += f"    type: {cluster.type}\n"
            if cluster.description:
                yaml_content += f'    description: "{cluster.description}"\n'
            yaml_content += "    config:\n"
            for key, value in cluster.config.items():
                yaml_content += f'      {key}: "{value}"\n'

        with open(self.compute_file, "w") as f:
            f.write(yaml_content)

    def get_config_path(self) -> Path:
        """Get the path to the computes.yml file."""
        return self.compute_file

    def ensure_config_exists(self) -> Path:
        """Ensure the config file exists at project-level and return its path."""
        if not self.compute_file.exists():
            self._load_defaults()
            # Write full template with samples to project-level
            self.project_dvt_dir.mkdir(parents=True, exist_ok=True)
            with open(self.compute_file, "w") as f:
                f.write(DEFAULT_COMPUTES_YAML)
        return self.compute_file

    @property
    def target_compute(self) -> str:
        """Get the default target compute engine."""
        return self._target_compute or "spark-local"

    @target_compute.setter
    def target_compute(self, value: str) -> None:
        """Set the default target compute engine."""
        if value not in self._clusters:
            raise DbtRuntimeError(
                f"Cannot set target_compute to '{value}': compute engine not found. "
                f"Available engines: {', '.join(self._clusters.keys())}"
            )
        self._target_compute = value
        self._save()

    def get(self, name: str) -> Optional[ComputeCluster]:
        """
        Get a compute cluster by name.

        :param name: Cluster name
        :returns: ComputeCluster or None if not found
        """
        return self._clusters.get(name)

    def list(self) -> List[ComputeCluster]:
        """
        List all registered clusters.

        :returns: List of ComputeCluster objects
        """
        return list(self._clusters.values())

    def exists(self, name: str) -> bool:
        """
        Check if a cluster exists.

        :param name: Cluster name
        :returns: True if cluster exists
        """
        return name in self._clusters

    @staticmethod
    def ensure_jdbc_jars_dir(project_dir: str) -> Path:
        """
        Ensure the project-level .dvt/jdbc_jars/ directory exists.

        :param project_dir: Path to project root directory
        :returns: Path to the jdbc_jars directory
        """
        jdbc_jars_dir = get_project_dvt_dir(project_dir) / "jdbc_jars"
        jdbc_jars_dir.mkdir(parents=True, exist_ok=True)
        return jdbc_jars_dir

    def get_jdbc_jars_dir(self) -> Path:
        """Get the project-level jdbc_jars directory path."""
        return self.jdbc_jars_dir
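For orientation, here is a minimal usage sketch of the module added above, as it might be exercised from a Python shell. The project path is hypothetical, the import path assumes the dbt/config/compute.py layout shown in this wheel, and only classes and methods visible in this diff are used; treat it as an illustrative sketch, not official DVT documentation.

# Sketch: load the project-level compute registry and inspect its engines.
from dbt.config.compute import ComputeRegistry, SparkPlatform

registry = ComputeRegistry(project_dir="./my_project")  # hypothetical project path
registry.ensure_config_exists()   # writes .dvt/computes.yml with the commented template if missing

print(registry.target_compute)    # "spark-local" unless computes.yml overrides it
for cluster in registry.list():
    # detect_platform() keys off the config: Dataproc (project/region/cluster),
    # EMR (master=yarn), Standalone (spark://...), Local (local[*] or no master).
    print(cluster.name, cluster.detect_platform())

emr = registry.get("emr-cluster")  # None until the commented EMR sample is uncommented
if emr is not None and emr.detect_platform() is SparkPlatform.EMR:
    registry.target_compute = "emr-cluster"  # setter validates the name and rewrites computes.yml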