dvt_core-0.52.2-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/config/catalogs.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ from copy import deepcopy
+ from typing import Any, Dict, List, Optional
+
+ from dbt.artifacts.resources import Catalog, CatalogWriteIntegrationConfig
+ from dbt.clients.yaml_helper import load_yaml_text
+ from dbt.config.renderer import SecretRenderer
+ from dbt.constants import CATALOGS_FILE_NAME
+ from dbt.exceptions import YamlLoadError
+ from dbt_common.clients.system import load_file_contents
+ from dbt_common.exceptions import CompilationError, DbtValidationError
+
+
+ def load_catalogs_yml(project_dir: str, project_name: str) -> Dict[str, Any]:
+     path = os.path.join(project_dir, CATALOGS_FILE_NAME)
+
+     if os.path.isfile(path):
+         try:
+             contents = load_file_contents(path, strip=False)
+             yaml_content = load_yaml_text(contents)
+
+             if not yaml_content:
+                 raise DbtValidationError(f"The file at {path} is empty")
+
+             return yaml_content
+         except DbtValidationError as e:
+             raise YamlLoadError(project_name=project_name, path=CATALOGS_FILE_NAME, exc=e)
+
+     return {}
+
+
+ def load_single_catalog(raw_catalog: Dict[str, Any], renderer: SecretRenderer) -> Catalog:
+     try:
+         rendered_catalog = renderer.render_data(raw_catalog)
+     except CompilationError as exc:
+         raise DbtValidationError(str(exc)) from exc
+
+     Catalog.validate(rendered_catalog)
+
+     write_integrations = []
+     write_integration_names = set()
+
+     for raw_integration in rendered_catalog.get("write_integrations", []):
+         if raw_integration["name"] in write_integration_names:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' cannot have multiple 'write_integrations' with the same name: '{raw_integration['name']}'."
+             )
+
+         # We're going to let the adapter validate the integration config
+         write_integrations.append(
+             CatalogWriteIntegrationConfig(**raw_integration, catalog_name=raw_catalog["name"])
+         )
+         write_integration_names.add(raw_integration["name"])
+
+     # Validate + set default active_write_integration if unset
+     active_write_integration = rendered_catalog.get("active_write_integration")
+     valid_write_integration_names = [integration.name for integration in write_integrations]
+
+     if not active_write_integration:
+         if len(valid_write_integration_names) == 1:
+             active_write_integration = write_integrations[0].name
+         else:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' when multiple 'write_integrations' are provided."
+             )
+     else:
+         if active_write_integration not in valid_write_integration_names:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' from its set of defined 'write_integrations': {valid_write_integration_names}. Got: '{active_write_integration}'."
+             )
+
+     return Catalog(
+         name=raw_catalog["name"],
+         active_write_integration=active_write_integration,
+         write_integrations=write_integrations,
+     )
+
+
+ def load_catalogs(project_dir: str, project_name: str, cli_vars: Dict[str, Any]) -> List[Catalog]:
+     raw_catalogs = load_catalogs_yml(project_dir, project_name).get("catalogs", [])
+     catalogs_renderer = SecretRenderer(cli_vars)
+
+     return [load_single_catalog(raw_catalog, catalogs_renderer) for raw_catalog in raw_catalogs]
+
+
+ def get_active_write_integration(catalog: Catalog) -> Optional[CatalogWriteIntegrationConfig]:
+     for integration in catalog.write_integrations:
+         if integration.name == catalog.active_write_integration:
+             active_integration = deepcopy(integration)
+             active_integration.catalog_name = active_integration.name
+             active_integration.name = catalog.name
+             return active_integration
+
+     return None
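
For orientation, the snippet below is a minimal usage sketch of the loaders in this file. It is not part of the package: the project name and the catalogs.yml shape are hypothetical, and any write-integration fields beyond name are left to whatever the adapter expects.

# Hypothetical usage of dbt/config/catalogs.py, assuming a catalogs.yml at the
# project root along the lines of:
#
#   catalogs:
#     - name: analytics
#       write_integrations:
#         - name: prod_write        # remaining integration keys are adapter-specific
#
from dbt.config.catalogs import get_active_write_integration, load_catalogs

catalogs = load_catalogs(project_dir=".", project_name="my_project", cli_vars={})
for catalog in catalogs:
    # With a single write integration and no explicit active_write_integration,
    # load_single_catalog defaults to that integration.
    integration = get_active_write_integration(catalog)
    if integration is not None:
        # After the name swap in get_active_write_integration, .name holds the
        # catalog name and .catalog_name holds the original integration name.
        print(integration.name, integration.catalog_name)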
dbt/config/compute.cpython-310-darwin.so ADDED
Binary file
dbt/config/compute.py ADDED
@@ -0,0 +1,547 @@
+ """
+ Compute Cluster Registry
+
+ Manages external compute cluster configurations for DVT.
+
+ v0.5.97: Computes stored in ~/.dvt/.data/computes.yml (YAML format)
+ Managed exclusively via `dvt compute` CLI commands.
+ Contains comprehensive commented samples for all platforms.
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import yaml
+ from dbt_common.exceptions import DbtRuntimeError
+
+
+ def get_dvt_dir() -> Path:
+     """Get the DVT config directory (~/.dvt/)."""
+     # Check DVT_PROFILES_DIR env var first (same as profiles)
+     profiles_dir = os.environ.get("DVT_PROFILES_DIR")
+     if profiles_dir:
+         return Path(profiles_dir)
+     # Fall back to ~/.dvt/
+     return Path.home() / ".dvt"
+
+
+ def get_internal_data_dir() -> Path:
+     """Get the DVT internal data directory (~/.dvt/.data/)."""
+     return get_dvt_dir() / ".data"
+
+
+ class SparkPlatform(Enum):
+     """Spark platform types for connection strategies.
+
+     v0.51.2: Removed DATABRICKS (serverless cannot read external JDBC sources).
+     """
+
+     LOCAL = "local"
+     EMR = "emr"
+     DATAPROC = "dataproc"
+     STANDALONE = "standalone"  # Self-managed Spark clusters (spark://)
+     EXTERNAL = "external"  # Generic external cluster (fallback)
+
+
+ # Default computes.yml template with comprehensive commented samples
+ DEFAULT_COMPUTES_YAML = """# ============================================================================
+ # DVT Compute Engines Configuration (v0.5.98)
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ #
+ # Commands:
+ #   dvt compute test        Test connectivity to all compute engines
+ #   dvt compute edit        Open this file in your editor
+ #   dvt compute validate    Validate YAML syntax
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   - Local Spark: Uses spark.jars with local file paths (fast startup)
+ #   - Remote clusters: Uses spark.jars.packages with Maven coordinates
+ #     (workers download JARs from Maven Central at session start)
+ #
+ # Platform Detection:
+ #   DVT auto-detects the platform from config keys:
+ #   - Dataproc: project + region + cluster
+ #   - EMR: master=yarn (without Dataproc keys)
+ #   - Standalone: master=spark://...
+ #   - Local: master=local[*] or no master
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: spark-local
+
+ # ============================================================================
+ # COMPUTE ENGINES
+ # ============================================================================
+ # Each compute engine must have:
+ #   - type: 'spark' (currently the only supported type)
+ #   - config: Spark configuration options
+ #   - description: (optional) Human-readable description
+ # ============================================================================
+
+ computes:
+
+   # --------------------------------------------------------------------------
+   # LOCAL SPARK (Default - Works out of the box)
+   # --------------------------------------------------------------------------
+   # Embedded PySpark for development and small-medium datasets.
+   # Uses spark.jars with local file paths for fast startup.
+   # JDBC JARs are auto-discovered from profiles.yml connections.
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Development, testing, datasets < 10GB
+   # --------------------------------------------------------------------------
+   spark-local:
+     type: spark
+     description: "Local Spark for development and testing"
+     config:
+       master: "local[2]"                     # Use 2 CPU cores (local[*] for all)
+       spark.driver.memory: "2g"              # Driver memory
+       spark.executor.memory: "2g"            # Executor memory
+       spark.ui.enabled: "false"              # Disable Spark UI
+       spark.ui.showConsoleProgress: "false"  # No progress bars
+       # Spark 4.0 legacy compatibility flags
+       spark.sql.legacy.postgres.datetimeMapping.enabled: "true"
+       spark.sql.legacy.mysql.timestampNTZMapping.enabled: "true"
+       spark.sql.legacy.oracle.timestampMapping.enabled: "true"
+       spark.sql.legacy.mssqlserver.numericMapping.enabled: "true"
+       # Performance optimizations
+       spark.sql.shuffle.partitions: "8"
+       spark.sql.execution.arrow.pyspark.enabled: "true"
+       spark.sql.execution.arrow.pyspark.fallback.enabled: "true"
+       spark.sql.adaptive.enabled: "true"
+       spark.sql.adaptive.coalescePartitions.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # AWS EMR (Elastic MapReduce)
+   # --------------------------------------------------------------------------
+   # Connects to AWS EMR clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - AWS credentials configured (aws configure or IAM role)
+   #   - EMR cluster must be running
+   #   - Network access to EMR master node
+   #
+   # Cost: ~$1.20/hr (typical 5-node m5.xlarge cluster)
+   # Best for: AWS-native workloads, S3 data integration
+   # --------------------------------------------------------------------------
+   # emr-cluster:
+   #   type: spark
+   #   description: "AWS EMR Spark Cluster"
+   #   config:
+   #     master: "yarn"                       # Required: YARN resource manager
+   #     spark.submit.deployMode: "client"    # Client mode for interactive
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.instances: "4"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # GCP DATAPROC (Google Cloud Spark)
+   # --------------------------------------------------------------------------
+   # Connects to GCP Dataproc clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - gcloud SDK configured (gcloud auth login)
+   #   - Dataproc cluster must be running
+   #   - Network access to Dataproc master
+   #
+   # Cost: ~$1.15/hr (typical 5-node n1-standard-4 cluster)
+   # Best for: GCP-native workloads, BigQuery/GCS integration
+   # --------------------------------------------------------------------------
+   # dataproc-cluster:
+   #   type: spark
+   #   description: "GCP Dataproc Cluster"
+   #   config:
+   #     project: "my-gcp-project"            # Required: GCP project ID
+   #     region: "us-central1"                # Required: Dataproc region
+   #     cluster: "my-dataproc-cluster"       # Required: Cluster name
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # STANDALONE SPARK CLUSTER
+   # --------------------------------------------------------------------------
+   # Connects to self-managed Spark clusters (on-premises or cloud VMs).
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   # Workers download JARs from Maven Central at session start.
+   #
+   # Requirements:
+   #   - Spark master accessible at spark://host:port
+   #   - Workers must have network access to Maven Central
+   #
+   # Cost: Infrastructure-dependent (your own hardware/VMs)
+   # Best for: On-premises deployments, custom Spark configurations
+   # --------------------------------------------------------------------------
+   # spark-cluster:
+   #   type: spark
+   #   description: "Standalone Spark Cluster"
+   #   config:
+   #     master: "spark://master-node:7077"   # Required: Spark master URL
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.cores: "4"
+   #     spark.executor.instances: "10"
+
+   # --------------------------------------------------------------------------
+   # HIGH-MEMORY LOCAL SPARK
+   # --------------------------------------------------------------------------
+   # For larger local workloads (requires more system RAM).
+   # Same JAR provisioning as spark-local (local file paths).
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Larger datasets on powerful workstations
+   # --------------------------------------------------------------------------
+   # spark-local-large:
+   #   type: spark
+   #   description: "High-memory local Spark for large datasets"
+   #   config:
+   #     master: "local[*]"                   # Use all available cores
+   #     spark.driver.memory: "8g"
+   #     spark.executor.memory: "8g"
+   #     spark.sql.shuffle.partitions: "200"
+   #     spark.sql.adaptive.enabled: "true"
+   #     spark.sql.adaptive.coalescePartitions.enabled: "true"
+   #     spark.sql.adaptive.skewJoin.enabled: "true"
+   #     spark.memory.fraction: "0.8"
+   #     spark.memory.storageFraction: "0.3"
+
+ # ============================================================================
+ # CONFIGURATION REFERENCE
+ # ============================================================================
+ # Common Spark configurations:
+ #
+ # Memory:
+ #   spark.driver.memory: "4g"                # Driver memory (default 1g)
+ #   spark.executor.memory: "4g"              # Executor memory (default 1g)
+ #   spark.memory.fraction: "0.6"             # Fraction for execution/storage
+ #
+ # Parallelism:
+ #   spark.executor.cores: "4"                # Cores per executor
+ #   spark.executor.instances: "4"            # Number of executors
+ #   spark.sql.shuffle.partitions: "200"      # Shuffle partitions
+ #   spark.default.parallelism: "100"         # Default parallelism
+ #
+ # Arrow (PyArrow integration):
+ #   spark.sql.execution.arrow.pyspark.enabled: "true"
+ #   spark.sql.execution.arrow.maxRecordsPerBatch: "10000"
+ #
+ # Adaptive Query Execution (Spark 3.0+):
+ #   spark.sql.adaptive.enabled: "true"
+ #   spark.sql.adaptive.coalescePartitions.enabled: "true"
+ #   spark.sql.adaptive.skewJoin.enabled: "true"
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   Local Spark:
+ #     - Uses spark.jars with local file paths
+ #     - Fast startup (no download needed)
+ #     - JARs auto-discovered from profiles.yml
+ #
+ #   Remote Clusters (EMR, Dataproc, Standalone):
+ #     - Uses spark.jars.packages with Maven coordinates
+ #     - Workers download JARs at session start
+ #     - Supported databases: PostgreSQL, MySQL, Oracle, SQL Server,
+ #       Snowflake, Redshift, BigQuery, Teradata, DB2, and 30+ more
+ # ============================================================================
+ """
+
+
+ @dataclass
+ class ComputeCluster:
+     """Configuration for an external compute cluster."""
+
+     name: str  # Cluster identifier
+     type: str  # 'spark' (currently only Spark supported for external)
+     config: Dict[str, Any] = field(default_factory=dict)  # Cluster-specific config
+     description: Optional[str] = None
+     cost_per_hour: Optional[float] = None  # Estimated cost per hour (USD)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Serialize to dictionary."""
+         result = {
+             "type": self.type,
+             "config": self.config,
+         }
+         if self.description:
+             result["description"] = self.description
+         if self.cost_per_hour is not None:
+             result["cost_per_hour"] = self.cost_per_hour
+         return result
+
+     @classmethod
+     def from_dict(cls, name: str, data: Dict[str, Any]) -> "ComputeCluster":
+         """Deserialize from dictionary."""
+         return cls(
+             name=name,
+             type=data.get("type", "spark"),
+             config=data.get("config", {}),
+             description=data.get("description"),
+             cost_per_hour=data.get("cost_per_hour"),
+         )
+
+     def detect_platform(self) -> SparkPlatform:
+         """
+         Detect Spark platform from configuration keys.
+
+         v0.51.2: Removed Databricks support.
+         Detection order (most specific first):
+         1. Dataproc: project + region + cluster
+         2. EMR: master=yarn (without Dataproc keys)
+         3. Standalone: master=spark://
+         4. Local: master=local[*] or no master
+         5. External: fallback for unknown configurations
+
+         :returns: SparkPlatform enum value
+         """
+         if self.type != "spark":
+             return SparkPlatform.EXTERNAL
+
+         config_keys = set(self.config.keys())
+
+         # 1. Dataproc: has project, region, and cluster
+         if all(k in config_keys for k in ["project", "region", "cluster"]):
+             return SparkPlatform.DATAPROC
+
+         # Check master value for remaining platforms
+         if "master" in config_keys:
+             master = str(self.config["master"]).lower()
+
+             # 2. EMR: master=yarn (without Dataproc keys)
+             if master == "yarn":
+                 return SparkPlatform.EMR
+
+             # 3. Standalone: master=spark://
+             if master.startswith("spark://"):
+                 return SparkPlatform.STANDALONE
+
+             # 4. Local: master=local[*]
+             if master.startswith("local"):
+                 return SparkPlatform.LOCAL
+
+             # 5. External: unknown master format
+             return SparkPlatform.EXTERNAL
+
+         # Default to local (no master specified)
+         return SparkPlatform.LOCAL
+
+
+ class ComputeRegistry:
+     """
+     Registry for managing external compute clusters.
+
+     v0.5.97: Clusters stored in ~/.dvt/.data/computes.yml (YAML format)
+     Managed exclusively via `dvt compute` CLI commands.
+     """
+
+     def __init__(self, project_dir: Optional[str] = None):
+         """
+         Initialize compute registry.
+
+         :param project_dir: Path to project root directory (for JDBC jars)
+         """
+         self.project_dir = project_dir or os.getcwd()
+         self.data_dir = get_internal_data_dir()
+         self.compute_file = self.data_dir / "computes.yml"
+         # Also check for old JSON file for migration
+         self.old_compute_file = self.data_dir / "computes.json"
+         # JDBC jars stay at project level
+         self.jdbc_jars_dir = os.path.join(self.project_dir, ".dvt", "jdbc_jars")
+         self._clusters: Dict[str, ComputeCluster] = {}
+         self._target_compute: Optional[str] = None
+         self._load()
+
+     def _load(self) -> None:
+         """Load clusters from internal storage."""
+         # First check for YAML file (new format)
+         if self.compute_file.exists():
+             self._load_from_yaml()
+             return
+
+         # Check for old JSON file and migrate
+         if self.old_compute_file.exists():
+             self._migrate_from_json()
+             return
+
+         # No files exist - create defaults
+         self._load_defaults()
+         self._save()
+
+     def _load_from_yaml(self) -> None:
+         """Load clusters from YAML file."""
+         try:
+             with open(self.compute_file, "r") as f:
+                 data = yaml.safe_load(f)
+
+             if not data:
+                 self._load_defaults()
+                 return
+
+             # Parse target_compute (default compute engine)
+             self._target_compute = data.get("target_compute", "spark-local")
+
+             # Parse computes
+             computes_data = data.get("computes", {})
+             for name, cluster_data in computes_data.items():
+                 if cluster_data:  # Skip None/empty entries
+                     cluster = ComputeCluster.from_dict(name, cluster_data)
+                     self._clusters[cluster.name] = cluster
+
+             # If no computes defined, use defaults
+             if not self._clusters:
+                 self._load_defaults()
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to load compute registry: {str(e)}") from e
+
+     def _migrate_from_json(self) -> None:
+         """Migrate from old JSON format to YAML."""
+         import json
+
+         try:
+             with open(self.old_compute_file, "r") as f:
+                 data = json.load(f)
+
+             if data:
+                 self._target_compute = data.get("target_compute", "spark-local")
+                 computes_data = data.get("computes", {})
+                 for name, cluster_data in computes_data.items():
+                     if cluster_data:
+                         cluster_data["name"] = name
+                         cluster = ComputeCluster.from_dict(name, cluster_data)
+                         self._clusters[cluster.name] = cluster
+
+             if not self._clusters:
+                 self._load_defaults()
+
+             # Save in new YAML format
+             self._save()
+
+             # Remove old JSON file
+             self.old_compute_file.unlink()
+
+         except Exception:
+             self._load_defaults()
+             self._save()
+
+     def _load_defaults(self) -> None:
+         """Load default out-of-box compute engines."""
+         data = yaml.safe_load(DEFAULT_COMPUTES_YAML)
+
+         self._target_compute = data.get("target_compute", "spark-local")
+
+         computes_data = data.get("computes", {})
+         for name, cluster_data in computes_data.items():
+             if cluster_data:  # Skip None entries (commented out samples)
+                 cluster = ComputeCluster.from_dict(name, cluster_data)
+                 self._clusters[cluster.name] = cluster
+
+     def _save(self) -> None:
+         """Save clusters to YAML file while preserving comments."""
+         # Ensure data directory exists
+         os.makedirs(self.data_dir, exist_ok=True)
+
+         # Build the YAML content with active computes
+         computes_dict = {}
+         for cluster in self._clusters.values():
+             computes_dict[cluster.name] = cluster.to_dict()
+
+         # If file exists, try to preserve comments by updating only the active section
+         # For simplicity, we'll write the full template with active computes
+         yaml_content = f"""# ============================================================================
+ # DVT Compute Engines Configuration
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ # Edit with: dvt compute edit
+ # Validate with: dvt compute validate
+ # Test with: dvt compute test
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: {self._target_compute or 'spark-local'}
+
+ computes:
+ """
+         # Add active computes
+         for name, cluster in self._clusters.items():
+             yaml_content += f"\n  {name}:\n"
+             yaml_content += f"    type: {cluster.type}\n"
+             if cluster.description:
+                 yaml_content += f'    description: "{cluster.description}"\n'
+             yaml_content += "    config:\n"
+             for key, value in cluster.config.items():
+                 yaml_content += f'      {key}: "{value}"\n'
+
+         with open(self.compute_file, "w") as f:
+             f.write(yaml_content)
+
+     def get_config_path(self) -> Path:
+         """Get the path to the computes.yml file."""
+         return self.compute_file
+
+     def ensure_config_exists(self) -> Path:
+         """Ensure the config file exists and return its path."""
+         if not self.compute_file.exists():
+             self._load_defaults()
+             # Write full template with samples
+             os.makedirs(self.data_dir, exist_ok=True)
+             with open(self.compute_file, "w") as f:
+                 f.write(DEFAULT_COMPUTES_YAML)
+         return self.compute_file
+
+     @property
+     def target_compute(self) -> str:
+         """Get the default target compute engine."""
+         return self._target_compute or "spark-local"
+
+     @target_compute.setter
+     def target_compute(self, value: str) -> None:
+         """Set the default target compute engine."""
+         if value not in self._clusters:
+             raise DbtRuntimeError(
+                 f"Cannot set target_compute to '{value}': compute engine not found. "
+                 f"Available engines: {', '.join(self._clusters.keys())}"
+             )
+         self._target_compute = value
+         self._save()
+
+     def get(self, name: str) -> Optional[ComputeCluster]:
+         """
+         Get a compute cluster by name.
+
+         :param name: Cluster name
+         :returns: ComputeCluster or None if not found
+         """
+         return self._clusters.get(name)
+
+     def list(self) -> List[ComputeCluster]:
+         """
+         List all registered clusters.
+
+         :returns: List of ComputeCluster objects
+         """
+         return list(self._clusters.values())
+
+     def exists(self, name: str) -> bool:
+         """
+         Check if a cluster exists.
+
+         :param name: Cluster name
+         :returns: True if cluster exists
+         """
+         return name in self._clusters
+
+     @staticmethod
+     def ensure_jdbc_jars_dir(project_dir: str) -> None:
+         """
+         Ensure the project-level .dvt/jdbc_jars/ directory exists.
+
+         :param project_dir: Path to project root directory
+         """
+         jdbc_jars_dir = os.path.join(project_dir, ".dvt", "jdbc_jars")
+         os.makedirs(jdbc_jars_dir, exist_ok=True)
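
As a quick illustration of the registry API and the platform-detection order documented above, here is a hedged sketch that is not part of the package: the engine names and config values are invented, and constructing ComputeRegistry reads (or creates) ~/.dvt/.data/computes.yml as described in this module.

from dbt.config.compute import ComputeCluster, ComputeRegistry, SparkPlatform

# Detection order: Dataproc keys first, then master=yarn (EMR),
# spark:// (standalone), local[...] (local), otherwise external.
dataproc = ComputeCluster(
    name="dp-example",
    type="spark",
    config={"project": "my-gcp-project", "region": "us-central1", "cluster": "dp-1"},
)
assert dataproc.detect_platform() is SparkPlatform.DATAPROC

onprem = ComputeCluster(name="onprem", type="spark", config={"master": "spark://master:7077"})
assert onprem.detect_platform() is SparkPlatform.STANDALONE

# The registry loads ~/.dvt/.data/computes.yml, creating it with the
# spark-local default if it does not exist yet.
registry = ComputeRegistry()
print("target compute:", registry.target_compute)
for cluster in registry.list():
    print(cluster.name, "->", cluster.detect_platform().value)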