dvt-core 0.59.0a51 (dvt_core-0.59.0a51-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/compute/smart_selector.py
@@ -0,0 +1,377 @@
+"""
+Smart Compute Engine Selector
+
+Selects compute engine based on DVT compute rules (NOT size-based).
+
+v0.56.0: Refactored to follow DVT compute rules:
+1. CLI --target-compute override (highest priority)
+2. Model-level config {{ config(compute='...') }}
+3. Default from computes.yml target_compute
+4. Pushdown when model and all inputs are in same target (no Spark needed)
+
+Selection is deterministic based on configuration, not data characteristics.
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Optional, Set
+
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+from dbt_common.exceptions import DbtRuntimeError
+
+
+class ExecutionStrategy(Enum):
+    """Execution strategy for a node."""
+
+    PUSHDOWN = "pushdown"  # Execute directly on target adapter (same connection)
+    FEDERATED = "federated"  # Execute via Spark for cross-target queries
+
+
+@dataclass
+class WorkloadEstimate:
+    """Estimated workload characteristics for a query."""
+
+    estimated_rows: int  # Estimated total rows to process
+    source_count: int  # Number of source tables
+    connection_count: int  # Number of different connections
+    has_aggregations: bool  # Query contains GROUP BY or aggregations
+    has_joins: bool  # Query contains JOIN operations
+    complexity_score: float  # 0.0 to 1.0, higher = more complex
+
+    @property
+    def estimated_data_mb(self) -> float:
+        """Rough estimate of data size in MB (assuming ~100 bytes/row)."""
+        return (self.estimated_rows * 100) / (1024 * 1024)
+
+
+class SmartComputeSelector:
+    """
+    Selects compute engine based on DVT compute rules.
+
+    v0.56.0: Rule-based selection (NO size-based logic).
+
+    Selection hierarchy (highest to lowest priority):
+    1. CLI --target-compute override
+    2. Model config: {{ config(compute='spark-cluster') }}
+    3. Default from computes.yml target_compute
+
+    Execution strategy:
+    - PUSHDOWN: When model and all inputs are in same target
+    - FEDERATED: When sources span multiple targets (requires Spark)
+    """
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        compute_registry: Optional[Any] = None,
+        cli_target_compute: Optional[str] = None,
+    ):
+        """
+        Initialize smart selector.
+
+        :param manifest: The dbt manifest
+        :param compute_registry: ComputeRegistry instance for compute configuration
+        :param cli_target_compute: CLI --target-compute override (highest priority)
+        """
+        self.manifest = manifest
+        self.compute_registry = compute_registry
+        self.cli_target_compute = cli_target_compute
+
+    def select_engine(
+        self,
+        node: ManifestNode,
+        analysis_result: QueryAnalysisResult,
+        cli_override: Optional[str] = None,
+    ) -> str:
+        """
+        Select compute engine based on DVT rules.
+
+        v0.56.0: Rule-based selection (no size-based logic).
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param analysis_result: Query analysis result
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name (e.g., "spark-local", "spark-cluster")
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        # Determine execution strategy first
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        # For pushdown, no Spark compute needed
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "pushdown"
+
+        # For federated execution, select compute engine
+        return self._select_compute_for_federation(node, cli_override)
+
+    def _determine_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Determine whether to use pushdown or federation.
+
+        DVT Rule: Pushdown when model and ALL inputs are in same target.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy (PUSHDOWN or FEDERATED)
+        """
+        # Get target connection for this node
+        node_target = self._get_node_target(node)
+
+        # Get all source connections
+        source_connections = analysis_result.source_connections
+
+        # If no sources, can use pushdown (pure computation)
+        if not source_connections:
+            return ExecutionStrategy.PUSHDOWN
+
+        # Check if all sources are in the same connection as the target
+        if len(source_connections) == 1:
+            source_connection = next(iter(source_connections))
+            if source_connection == node_target:
+                # Same connection - use pushdown
+                return ExecutionStrategy.PUSHDOWN
+
+        # Multiple connections or different target - must federate
+        return ExecutionStrategy.FEDERATED
+
+    def _get_node_target(self, node: ManifestNode) -> str:
+        """
+        Get the target connection for a node.
+
+        :param node: The manifest node
+        :returns: Target connection name
+        """
+        # Check if node has explicit target config
+        if hasattr(node, "config") and hasattr(node.config, "target"):
+            if node.config.target:
+                return node.config.target
+
+        # Otherwise, use default target from manifest
+        # Note: In DVT, this comes from profiles.yml default target
+        return "default"
+
+    def _select_compute_for_federation(
+        self, node: ManifestNode, cli_override: Optional[str] = None
+    ) -> str:
+        """
+        Select compute engine for federated execution.
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        compute_name = None
+
+        # Priority 1: CLI override (call-time)
+        if cli_override:
+            compute_name = cli_override
+
+        # Priority 2: CLI override (init-time)
+        elif self.cli_target_compute:
+            compute_name = self.cli_target_compute
+
+        # Priority 3: Model-level config
+        elif hasattr(node, "config") and hasattr(node.config, "compute"):
+            if node.config.compute:
+                compute_name = node.config.compute
+
+        # Priority 4: Default from computes.yml
+        elif self.compute_registry:
+            compute_name = self.compute_registry.target_compute
+
+        # Fallback if no registry
+        if not compute_name:
+            compute_name = "spark-local"
+
+        # Validate the compute engine exists
+        if self.compute_registry and not self.compute_registry.exists(compute_name):
+            available = [c.name for c in self.compute_registry.list()]
+            raise DbtRuntimeError(
+                f"Compute engine '{compute_name}' not found. "
+                f"Available engines: {', '.join(available)}"
+            )
+
+        return compute_name
+
+    def _estimate_workload(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> WorkloadEstimate:
+        """
+        Estimate workload characteristics for a node.
+
+        Note: Used for informational purposes only, NOT for compute selection.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: WorkloadEstimate
+        """
+        # Count sources
+        source_count = len(analysis_result.source_refs)
+        connection_count = len(analysis_result.source_connections)
+
+        # Estimate row count (informational only)
+        estimated_rows = self._estimate_row_count(analysis_result.source_refs)
+
+        # Analyze SQL for complexity (informational only)
+        sql = node.compiled_code if hasattr(node, "compiled_code") else node.raw_code
+        has_aggregations = self._has_aggregations(sql)
+        has_joins = self._has_joins(sql)
+
+        # Calculate complexity score (informational only)
+        complexity_score = self._calculate_complexity(
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+        )
+
+        return WorkloadEstimate(
+            estimated_rows=estimated_rows,
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+            complexity_score=complexity_score,
+        )
+
+    def _estimate_row_count(self, source_refs: set) -> int:
+        """
+        Estimate total row count from source tables.
+
+        Note: Used for informational purposes only.
+
+        :param source_refs: Set of source unique_ids
+        :returns: Estimated row count
+        """
+        total_rows = 0
+
+        for source_id in source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                total_rows += 100000
+                continue
+
+            # Heuristic based on naming (informational only)
+            if (
+                "fact" in source.identifier.lower()
+                or "events" in source.identifier.lower()
+            ):
+                total_rows += 1000000
+            elif (
+                "dim" in source.identifier.lower()
+                or "lookup" in source.identifier.lower()
+            ):
+                total_rows += 10000
+            else:
+                total_rows += 100000
+
+        return total_rows
+
+    def _has_aggregations(self, sql: str) -> bool:
+        """Check if SQL contains aggregations."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " GROUP BY ",
+                " SUM(",
+                " COUNT(",
+                " AVG(",
+                " MIN(",
+                " MAX(",
+                " HAVING ",
+            ]
+        )
+
+    def _has_joins(self, sql: str) -> bool:
+        """Check if SQL contains joins."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " JOIN ",
+                " INNER JOIN ",
+                " LEFT JOIN ",
+                " RIGHT JOIN ",
+                " FULL JOIN ",
+                " CROSS JOIN ",
+            ]
+        )
+
+    def _calculate_complexity(
+        self,
+        source_count: int,
+        connection_count: int,
+        has_aggregations: bool,
+        has_joins: bool,
+    ) -> float:
+        """Calculate query complexity score (0.0 to 1.0)."""
+        score = 0.0
+        score += min(source_count / 10.0, 0.3)
+        score += min(connection_count / 5.0, 0.2)
+        if has_aggregations:
+            score += 0.2
+        if has_joins:
+            score += 0.3
+        return min(score, 1.0)
+
+    def get_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Get the execution strategy for a node (public API).
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy enum
+        """
+        return self._determine_execution_strategy(node, analysis_result)
+
+    def get_recommendation_reason(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Get human-readable explanation for engine selection.
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: Explanation string
+        """
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "Pushdown: All sources in same target connection - executing directly"
+
+        # Federated execution
+        engine = self._select_compute_for_federation(node)
+        estimate = self._estimate_workload(node, analysis_result)
+
+        reasons = []
+        reasons.append(f"Cross-target query ({estimate.connection_count} connections)")
+
+        if self.cli_target_compute:
+            reasons.append(f"CLI override: --target-compute {self.cli_target_compute}")
+        elif hasattr(node, "config") and hasattr(node.config, "compute") and node.config.compute:
+            reasons.append(f"Model config: compute='{node.config.compute}'")
+        else:
+            reasons.append("Using default from computes.yml")

+        reason_str = "; ".join(reasons)
+        return f"Federated ({engine}): {reason_str}"
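A minimal usage sketch for smart_selector.py above, using only the API shown in the diff. The manifest, compute_registry, node, and analysis_result objects are assumed to come from the surrounding dbt/DVT run (parser, compute registry, query analyzer) and are not constructed here:

    from dbt.compute.smart_selector import ExecutionStrategy, SmartComputeSelector

    # manifest, compute_registry, node, and analysis_result are hypothetical
    # objects produced elsewhere in the run.
    selector = SmartComputeSelector(
        manifest=manifest,
        compute_registry=compute_registry,
        cli_target_compute=None,  # no --target-compute override
    )

    strategy = selector.get_execution_strategy(node, analysis_result)
    if strategy == ExecutionStrategy.PUSHDOWN:
        engine = "pushdown"  # all inputs share the node's target; run on the target adapter
    else:
        # Cross-target query: resolve a Spark compute via the priority chain
        engine = selector.select_engine(node, analysis_result)

    print(selector.get_recommendation_reason(node, analysis_result))
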
dbt/compute/spark_logger.py
@@ -0,0 +1,272 @@
+# =============================================================================
+# DVT Spark Output Logger
+# =============================================================================
+# Captures Spark/compute output to log files for debugging while keeping
+# console clean with progress bars.
+#
+# DVT v0.59.0a36: New module for Spark output capture
+# =============================================================================
+
+from __future__ import annotations
+
+import sys
+import threading
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, TextIO
+
+
+class TeeWriter:
+    """
+    A writer that writes to both the original stream and a log file.
+
+    This allows us to capture Spark output to a log file while still
+    passing it through (though in practice we suppress console output
+    by using Rich's Live display which takes over the terminal).
+    """
+
+    def __init__(self, original: TextIO, log_file: TextIO, suppress_console: bool = True):
+        self.original = original
+        self.log_file = log_file
+        self.suppress_console = suppress_console
+        self._lock = threading.Lock()
+
+    def write(self, data: str) -> int:
+        """Write data to log file and optionally to original stream."""
+        with self._lock:
+            # Always write to log file
+            try:
+                self.log_file.write(data)
+                self.log_file.flush()
+            except Exception:
+                pass  # Don't break if log file write fails
+
+            # Write to original only if not suppressing console
+            if not self.suppress_console:
+                return self.original.write(data)
+
+            return len(data)
+
+    def flush(self) -> None:
+        """Flush both streams."""
+        with self._lock:
+            try:
+                self.log_file.flush()
+            except Exception:
+                pass
+            if not self.suppress_console:
+                self.original.flush()
+
+    def fileno(self) -> int:
+        """Return the file descriptor of the original stream."""
+        return self.original.fileno()
+
+    def isatty(self) -> bool:
+        """Return whether the original stream is a tty."""
+        return self.original.isatty()
+
+
+class SparkOutputLogger:
+    """
+    Captures Spark/compute stderr and stdout to a log file.
+
+    The log file is written to target/{compute_name}_log.txt and overwrites
+    each time a new session starts. Each session is separated by a clear
+    header with timestamp.
+
+    Usage:
+        logger = SparkOutputLogger.get_instance(target_dir="/path/to/target", compute_name="spark")
+        logger.start_session()
+        # ... Spark operations ...
+        logger.end_session()
+
+    The logger is a singleton per (target_dir, compute_name) combination.
+    """
+
+    _instances: dict[tuple[str, str], "SparkOutputLogger"] = {}
+    _global_lock = threading.Lock()
+
+    def __init__(self, target_dir: str, compute_name: str = "spark"):
+        """
+        Initialize the Spark output logger.
+
+        Args:
+            target_dir: Path to the dbt target directory
+            compute_name: Name of the compute engine (used in log filename)
+        """
+        self.target_dir = Path(target_dir)
+        self.compute_name = compute_name
+        self.log_path = self.target_dir / f"{compute_name}_log.txt"
+        self._log_file: Optional[TextIO] = None
+        self._original_stderr: Optional[TextIO] = None
+        self._original_stdout: Optional[TextIO] = None
+        self._tee_stderr: Optional[TeeWriter] = None
+        self._tee_stdout: Optional[TeeWriter] = None
+        self._session_active = False
+        self._lock = threading.Lock()
+
+    @classmethod
+    def get_instance(cls, target_dir: str, compute_name: str = "spark") -> "SparkOutputLogger":
+        """
+        Get or create a singleton instance for the given target_dir and compute_name.
+
+        Args:
+            target_dir: Path to the dbt target directory
+            compute_name: Name of the compute engine
+
+        Returns:
+            SparkOutputLogger instance
+        """
+        key = (str(target_dir), compute_name)
+        with cls._global_lock:
+            if key not in cls._instances:
+                cls._instances[key] = cls(target_dir, compute_name)
+            return cls._instances[key]
+
+    def start_session(self, suppress_console: bool = True) -> None:
+        """
+        Start a new logging session.
+
+        This overwrites the previous log file and writes a session header.
+        stderr and stdout are redirected to capture Spark output.
+
+        Args:
+            suppress_console: If True, suppress output to console (default: True)
+        """
+        with self._lock:
+            if self._session_active:
+                return  # Already active
+
+            try:
+                # Ensure target directory exists
+                self.target_dir.mkdir(parents=True, exist_ok=True)
+
+                # Open log file (overwrite mode)
+                self._log_file = open(self.log_path, 'w', encoding='utf-8')
+
+                # Write session header
+                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                self._log_file.write("=" * 80 + "\n")
+                self._log_file.write(f" DVT {self.compute_name.upper()} LOG\n")
+                self._log_file.write(f" Session started: {timestamp}\n")
+                self._log_file.write("=" * 80 + "\n\n")
+                self._log_file.flush()
+
+                # Save original streams
+                self._original_stderr = sys.stderr
+                self._original_stdout = sys.stdout
+
+                # Create tee writers
+                self._tee_stderr = TeeWriter(
+                    self._original_stderr,
+                    self._log_file,
+                    suppress_console=suppress_console,
+                )
+                self._tee_stdout = TeeWriter(
+                    self._original_stdout,
+                    self._log_file,
+                    suppress_console=suppress_console,
+                )
+
+                # Redirect stderr and stdout
+                sys.stderr = self._tee_stderr  # type: ignore
+                sys.stdout = self._tee_stdout  # type: ignore
+
+                self._session_active = True
+
+            except Exception as e:
+                # Don't break the application if logging fails
+                self._cleanup()
+                # Optionally log the error
+                try:
+                    if self._original_stderr:
+                        self._original_stderr.write(f"[DVT] Warning: Could not start Spark logging: {e}\n")
+                except Exception:
+                    pass
+
+    def write_separator(self, label: str = "") -> None:
+        """
+        Write a separator line to the log file.
+
+        Useful for marking different phases of Spark execution.
+
+        Args:
+            label: Optional label for the separator
+        """
+        with self._lock:
+            if self._log_file:
+                try:
+                    timestamp = datetime.now().strftime("%H:%M:%S")
+                    if label:
+                        self._log_file.write(f"\n--- [{timestamp}] {label} ---\n\n")
+                    else:
+                        self._log_file.write(f"\n--- [{timestamp}] ---\n\n")
+                    self._log_file.flush()
+                except Exception:
+                    pass
+
+    def end_session(self) -> None:
+        """
+        End the logging session.
+
+        Restores original stderr and stdout, closes the log file.
+        """
+        with self._lock:
+            if not self._session_active:
+                return
+
+            try:
+                # Write session footer
+                if self._log_file:
+                    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    self._log_file.write("\n")
+                    self._log_file.write("=" * 80 + "\n")
+                    self._log_file.write(f" Session ended: {timestamp}\n")
+                    self._log_file.write("=" * 80 + "\n")
+
+            except Exception:
+                pass
+
+            self._cleanup()
+
+    def _cleanup(self) -> None:
+        """Restore original streams and close log file."""
+        # Restore original streams
+        if self._original_stderr:
+            sys.stderr = self._original_stderr
+            self._original_stderr = None
+
+        if self._original_stdout:
+            sys.stdout = self._original_stdout
+            self._original_stdout = None
+
+        # Close log file
+        if self._log_file:
+            try:
+                self._log_file.close()
+            except Exception:
+                pass
+            self._log_file = None
+
+        self._tee_stderr = None
+        self._tee_stdout = None
+        self._session_active = False
+
+    def __del__(self):
+        """Ensure cleanup on deletion."""
+        self._cleanup()
+
+
+# Convenience function for getting the logger
+def get_spark_logger(target_dir: str, compute_name: str = "spark") -> SparkOutputLogger:
+    """
+    Get a Spark output logger for the given target directory.
+
+    Args:
+        target_dir: Path to the dbt target directory
+        compute_name: Name of the compute engine (default: "spark")
+
+    Returns:
+        SparkOutputLogger instance
+    """
+    return SparkOutputLogger.get_instance(target_dir, compute_name)
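The SparkOutputLogger docstring above already shows the basic call sequence; the sketch below only adds a try/finally so the redirected stdout/stderr are always restored even if the Spark work raises. The "target" directory and the separator label are illustrative values:

    from dbt.compute.spark_logger import get_spark_logger

    logger = get_spark_logger(target_dir="target", compute_name="spark")
    logger.start_session(suppress_console=True)  # output now goes to target/spark_log.txt
    try:
        logger.write_separator("model: my_first_dbt_model")
        # ... run Spark / JDBC work here ...
    finally:
        logger.end_session()  # restores sys.stdout / sys.stderr and closes the log file
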
dbt/compute/strategies/__init__.py
@@ -0,0 +1,55 @@
+"""
+Spark Connection Strategies
+
+This module provides different strategies for connecting to Spark clusters.
+Uses the strategy pattern for flexible platform support.
+
+v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
+v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+"""
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt.compute.strategies.local import LocalStrategy, cleanup_all_spark_sessions
+
+# Strategies are imported lazily to avoid import errors when
+# optional dependencies are not installed
+
+
+def get_emr_strategy():
+    """
+    Lazily import and return EMRStrategy.
+
+    :returns: EMRStrategy class
+    """
+    from dbt.compute.strategies.emr import EMRStrategy
+    return EMRStrategy
+
+
+def get_dataproc_strategy():
+    """
+    Lazily import and return DataprocStrategy.
+
+    :returns: DataprocStrategy class
+    """
+    from dbt.compute.strategies.dataproc import DataprocStrategy
+    return DataprocStrategy
+
+
+def get_standalone_strategy():
+    """
+    Lazily import and return StandaloneStrategy.
+
+    :returns: StandaloneStrategy class
+    """
+    from dbt.compute.strategies.standalone import StandaloneStrategy
+    return StandaloneStrategy
+
+
+__all__ = [
+    "BaseConnectionStrategy",
+    "LocalStrategy",
+    "cleanup_all_spark_sessions",
+    "get_emr_strategy",
+    "get_dataproc_strategy",
+    "get_standalone_strategy",
+]
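A short sketch of how the lazy accessors in strategies/__init__.py might be used, so that merely importing the package never pulls in optional EMR or Dataproc dependencies. The ImportError fallback is an assumption about how a caller could handle a missing backend, not behavior defined by this module:

    from dbt.compute.strategies import LocalStrategy, get_emr_strategy

    try:
        EMRStrategy = get_emr_strategy()  # the emr module is imported only at this call
    except ImportError:
        EMRStrategy = None  # optional EMR dependencies are not installed

    strategy_cls = EMRStrategy or LocalStrategy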