dvt-core 0.58.6__cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Local Spark Connection Strategy
|
|
3
|
+
|
|
4
|
+
Provides embedded PySpark session for local development and testing.
|
|
5
|
+
This is the default strategy extracted from the original SparkEngine implementation.
|
|
6
|
+
|
|
7
|
+
Includes auto-configuration of Java with PySpark compatibility checking.
|
|
8
|
+
|
|
9
|
+
v0.51.3: Refactored to use java_compat module for centralized Java/PySpark compatibility.
|
|
10
|
+
v0.5.98: Added JAR provisioning using local file paths (spark.jars).
|
|
11
|
+
v0.58.5: Fixed Java 21 segfaults by NOT loading jdk.incubator.vector module.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from typing import Dict, Optional, Set, Tuple
|
|
16
|
+
|
|
17
|
+
from dbt.compute.strategies.base import BaseConnectionStrategy
|
|
18
|
+
from dbt_common.exceptions import DbtRuntimeError
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from pyspark.sql import SparkSession
|
|
22
|
+
|
|
23
|
+
PYSPARK_AVAILABLE = True
|
|
24
|
+
except ImportError:
|
|
25
|
+
PYSPARK_AVAILABLE = False
|
|
26
|
+
SparkSession = None
|
|
27
|
+
|
|
28
|
+
# Global Spark session cache for reuse across calls (within same process).
# Keys are MD5 hex digests of the strategy config; values are the live
# SparkSession objects created for that config.
_SPARK_SESSION_CACHE = {}

# Thread lock for safe session management: held while the cache above is
# read, populated, or cleared.
import threading
_SPARK_SESSION_LOCK = threading.Lock()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def cleanup_all_spark_sessions():
    """
    Stop every cached Spark session and empty the session cache.

    DVT v0.58.4: Intended to be called at the end of a run so that no
    session outlives the thread pool (prevents semaphore leaks and
    segfaults at shutdown).

    The whole operation runs while holding the module-level session lock.
    Failures while stopping an individual session are ignored so that the
    remaining sessions are still stopped and the cache is always cleared.
    """
    with _SPARK_SESSION_LOCK:
        # Snapshot the sessions first so the cache is not iterated directly.
        sessions = list(_SPARK_SESSION_CACHE.values())
        for session in sessions:
            try:
                session.stop()
            except Exception:
                # Best effort cleanup - move on to the next session
                continue
        _SPARK_SESSION_CACHE.clear()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _disable_multiprocessing_resource_tracker():
    """
    Clear the multiprocessing resource tracker references (best effort).

    DVT v0.58.5: PySpark 4.0 + Java 21 creates semaphores that conflict with
    Python's resource tracker during shutdown, causing segfaults on macOS.
    This sets ``resource_tracker._resource_tracker`` and
    ``resource_tracker._fd`` to ``None`` in an attempt to avoid that conflict.

    Any failure (including a failing import) is swallowed on purpose: this is
    a workaround, and session creation must continue without it.

    NOTE(review): in CPython the module-level ``register`` / ``unregister`` /
    ``ensure_running`` helpers appear to be bound to the original tracker
    instance at import time, so clearing these attributes may not actually
    stop tracking - confirm against the targeted Python version.
    """
    try:
        # Imported inside the guard so that an import failure is also treated
        # as "workaround unavailable" rather than aborting the caller.
        from multiprocessing import resource_tracker

        resource_tracker._resource_tracker = None
        resource_tracker._fd = None
    except Exception:
        pass  # Best effort - if it fails, continue anyway
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _ensure_java_available():
    """
    Ensure Java is available and compatible with installed PySpark.

    Uses the centralized java_compat module to look up the installed PySpark,
    enumerate Java installations, and pick the best one for the Java versions
    that PySpark supports. On success, JAVA_HOME is set to the selected
    installation and its ``bin`` directory is placed first on PATH.

    v0.51.3: Refactored to use java_compat module with enhanced compatibility checking.
    Always sets JAVA_HOME to a proper JDK path (not /usr or invalid paths).

    :raises DbtRuntimeError: If PySpark is not installed, or if no Java
        installation compatible with it is found
    """
    from dbt.compute.java_compat import (
        get_pyspark_info,
        find_all_java_installations,
        select_best_java,
    )

    # Get PySpark requirements
    pyspark = get_pyspark_info()
    if not pyspark:
        raise DbtRuntimeError(
            "PySpark is not installed. Install it with: pip install pyspark\n"
            "Or run 'dvt spark set-version' to select a specific version."
        )

    # Always search for Java installations and select the best one
    # This ensures JAVA_HOME is set to a proper JDK path (not /usr or invalid)
    all_java = find_all_java_installations()
    best_java = select_best_java(all_java, pyspark.java_supported)

    if best_java:
        # Set JAVA_HOME to the best compatible Java found
        # This is needed even if Java is in PATH because PySpark's scripts
        # rely on JAVA_HOME being set to a proper JDK directory
        os.environ["JAVA_HOME"] = best_java.path
        bin_path = os.path.join(best_java.path, "bin")

        # Prepend to PATH to ensure this Java is used. This function is
        # called on every session request, so skip the prepend when the
        # directory is already first - otherwise PATH grows on each call.
        current_path = os.environ.get("PATH", "")
        if current_path.split(os.pathsep)[0] != bin_path:
            os.environ["PATH"] = bin_path + os.pathsep + current_path
        return

    # No compatible Java found - show error with guidance
    supported_str = ", ".join(str(v) for v in pyspark.java_supported)
    raise DbtRuntimeError(
        f"No compatible Java found for PySpark {pyspark.version}.\n"
        f"PySpark {pyspark.major_minor} requires Java {supported_str}.\n\n"
        "Run 'dvt java search' to find Java installations.\n"
        "Run 'dvt java set' to select a compatible version.\n"
        "Run 'dvt java install' for installation guide."
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class LocalStrategy(BaseConnectionStrategy):
    """
    Local embedded Spark strategy.

    Creates an in-process PySpark session with a local master.
    Best for development, testing, and small-medium workloads.

    Configuration:
    {
        "master": "local[2]",  # optional, defaults to local[2]
        "spark.driver.memory": "4g",  # optional, defaults to 1g
        "spark.executor.memory": "4g",  # optional, defaults to 1g
        # ... any other Spark configs
    }
    """

    def validate_config(self) -> None:
        """
        Validate local strategy configuration.

        Local strategy is flexible - no required fields. The only check is
        that the config is a dictionary.

        :raises DbtRuntimeError: If the config is not a dictionary
        """
        if not isinstance(self.config, dict):
            raise DbtRuntimeError(
                f"Local Spark config must be a dictionary, got {type(self.config)}"
            )

    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
        """
        Create or reuse local Spark session.

        Creates an embedded PySpark session with defaults tuned for fast
        startup on small data. Sessions are cached per configuration (keyed
        by a hash of the config) and reused while they are still alive.

        DVT v0.5.3: Uses direct JAR paths instead of spark.jars.packages to avoid
        verbose Ivy output. JARs are discovered in <cwd>/.dvt/jdbc_jars/.

        Side effects: sets the _JAVA_OPTIONS, JAVA_HOME and PATH environment
        variables, writes a log4j2 properties file to the temp directory,
        creates ~/.dvt/jdbc_jars if missing, and stops any active Spark
        session that was created with a different configuration.

        :param adapter_types: Set of adapter types that need JDBC drivers (optional, for API compatibility; not used here)
        :returns: Initialized SparkSession
        :raises DbtRuntimeError: If PySpark is not installed or no compatible Java is found
        """
        import hashlib

        if not PYSPARK_AVAILABLE:
            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")

        # DVT v0.58.5: Disable resource tracker BEFORE any JVM operations
        # This prevents segfaults caused by semaphore cleanup conflicts
        _disable_multiprocessing_resource_tracker()

        # DVT v0.58.5: Set JVM options BEFORE Java starts to disable vector API
        # This must happen before PySpark creates the JVM
        _java_opts = os.environ.get("_JAVA_OPTIONS", "")
        if "-XX:-UseVectorCmov" not in _java_opts:
            os.environ["_JAVA_OPTIONS"] = f"{_java_opts} -XX:-UseVectorCmov -XX:-UseSIMDForMemoryOps -XX:+IgnoreUnrecognizedVMOptions".strip()

        # Auto-configure Java first
        _ensure_java_available()

        # Create cache key from config to reuse sessions with same configuration
        config_str = str(sorted(self.config.items()))
        cache_key = hashlib.md5(config_str.encode()).hexdigest()

        # Thread-safe session management
        with _SPARK_SESSION_LOCK:
            cached_spark = _SPARK_SESSION_CACHE.get(cache_key)
            if cached_spark is not None:
                try:
                    cached_spark.sparkContext.getConf()  # Will fail if session is dead
                    return cached_spark
                except Exception:
                    # Session died, remove from cache
                    del _SPARK_SESSION_CACHE[cache_key]

        # v0.51.0: Stop any existing session with DIFFERENT config so the new
        # session picks up this strategy's JAR configuration. Best effort.
        try:
            existing = SparkSession.getActiveSession()
            if existing:
                existing.stop()
                # Every cached session is now stale; drop them under the lock
                # like every other cache mutation in this module.
                with _SPARK_SESSION_LOCK:
                    _SPARK_SESSION_CACHE.clear()
        except Exception:
            pass

        # DVT v0.5.3: Suppress Java/Spark startup warnings completely
        log4j_file = self._write_log4j_config()

        # Kept for backward compatibility: the home-directory JAR cache is
        # still created, although JAR discovery reads the project directory.
        dvt_home = os.path.expanduser("~/.dvt")
        jar_cache_dir = os.path.join(dvt_home, "jdbc_jars")
        os.makedirs(jar_cache_dir, exist_ok=True)

        # DVT v0.5.3: Get cached JDBC jars (from project dir, not home dir)
        jar_paths = self._get_jdbc_jars(jar_cache_dir)

        builder = SparkSession.builder.appName(self.app_name)

        # Use local[2] instead of local[*] for faster startup
        master = self.config.get("master", "local[2]")
        builder = builder.master(master)

        # Apply fast configs (can be overridden by user config)
        fast_configs = self._default_spark_configs(jar_paths, log4j_file)
        for key, value in fast_configs.items():
            if key not in self.config:
                builder = builder.config(key, value)

        # Apply user-provided configs (except 'master' which is already set)
        for key, value in self.config.items():
            if key != "master":
                builder = builder.config(key, value)

        # Create Spark session
        spark = builder.getOrCreate()

        # Set log level to ERROR to suppress Spark warnings
        spark.sparkContext.setLogLevel("ERROR")

        # Cache the session for reuse (thread-safe)
        with _SPARK_SESSION_LOCK:
            _SPARK_SESSION_CACHE[cache_key] = spark

        return spark

    def _write_log4j_config(self) -> str:
        """
        Write the log4j2 properties file that silences Spark startup noise.

        The file is (re)written to ``dvt_log4j2.properties`` in the system
        temp directory on every call.

        :returns: Path of the written properties file
        """
        import tempfile

        log4j_config = "\n".join(
            [
                "",
                "status = error",
                "appender.console.type = Console",
                "appender.console.name = console",
                "appender.console.layout.type = PatternLayout",
                "appender.console.layout.pattern = %msg%n",
                "rootLogger.level = error",
                "rootLogger.appenderRef.console.ref = console",
                "logger.spark.name = org.apache.spark",
                "logger.spark.level = error",
                "logger.hadoop.name = org.apache.hadoop",
                "logger.hadoop.level = error",
                "",
            ]
        )
        log4j_file = os.path.join(tempfile.gettempdir(), "dvt_log4j2.properties")
        with open(log4j_file, "w", encoding="utf-8") as f:
            f.write(log4j_config)
        return log4j_file

    def _default_spark_configs(self, jar_paths: list, log4j_file: str) -> Dict[str, str]:
        """
        Build the default Spark configs applied when the user does not override them.

        :param jar_paths: JDBC JAR file paths to expose to Spark (may be empty)
        :param log4j_file: Path of the log4j2 properties file for the driver JVM
        :returns: Mapping of Spark config key to default value
        """
        # spark.jars takes a comma-separated list; the extraClassPath settings
        # are JVM classpaths and need the platform's path separator.
        jars_csv = ",".join(jar_paths)
        jars_classpath = os.pathsep.join(jar_paths)

        # DVT v0.58.5: Java 21 compatibility flags for PySpark 4.0
        # Minimal JVM options - don't restrict modules (causes Spark failures)
        driver_java_options = " ".join([
            f"-Dlog4j2.configurationFile=file:{log4j_file}",
            "-Djava.util.logging.level=SEVERE",
            # Java module system compatibility for Spark
            "--add-opens=java.base/java.lang=ALL-UNNAMED",
            "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
            "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
            "--add-opens=java.base/java.io=ALL-UNNAMED",
            "--add-opens=java.base/java.net=ALL-UNNAMED",
            "--add-opens=java.base/java.nio=ALL-UNNAMED",
            "--add-opens=java.base/java.util=ALL-UNNAMED",
            "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
            "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
            "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
            "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
            "--add-opens=java.base/sun.security.action=ALL-UNNAMED",
            "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED",
            "--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED",
            "-XX:+IgnoreUnrecognizedVMOptions",
        ])

        return {
            # Memory optimization
            "spark.driver.memory": "1g",
            "spark.executor.memory": "1g",
            # DVT v0.5.3: Use direct JAR paths (NO Ivy output!)
            "spark.jars": jars_csv,
            # DVT v0.58.5: Add JARs to classpath for JDBC driver loading
            "spark.driver.extraClassPath": jars_classpath,
            "spark.executor.extraClassPath": jars_classpath,
            "spark.driver.extraJavaOptions": driver_java_options,
            # Suppress Spark UI and progress
            "spark.ui.enabled": "false",
            "spark.ui.showConsoleProgress": "false",
            "spark.eventLog.enabled": "false",
            # Network optimizations
            "spark.driver.bindAddress": "127.0.0.1",
            "spark.driver.host": "localhost",
            # Reduce shuffle partitions for faster queries on small data
            "spark.sql.shuffle.partitions": "8",
            # DVT v0.58.4: Disable Arrow temporarily to avoid segfaults on macOS + Java 21
            # Arrow's native code can cause segfaults during Spark session creation
            "spark.sql.execution.arrow.pyspark.enabled": "false",
            "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
            "spark.sql.execution.arrow.enabled": "false",
            # Disable adaptive optimization (slow for small data)
            "spark.sql.adaptive.enabled": "false",
            "spark.sql.adaptive.coalescePartitions.enabled": "false",
        }

    def _get_jdbc_jars(self, cache_dir: str) -> list:
        """
        Discover ALL JDBC JAR files from project cache at runtime.

        v0.5.96: Dynamic discovery - finds all *.jar files in
        <current working directory>/.dvt/jdbc_jars/. This enables project
        folder portability (move folder -> JARs still work).

        JARs are downloaded via 'dvt target sync' command. A missing or empty
        directory simply yields an empty list; no warning is emitted.

        :param cache_dir: Directory to look for JAR files (ignored, uses project dir)
        :returns: Sorted list of JAR file absolute paths
        """
        import glob

        # Look for JARs in project directory (current working directory)
        jar_pattern = os.path.join(os.getcwd(), ".dvt", "jdbc_jars", "*.jar")

        # Sorted so the resulting Spark config is stable between runs
        return sorted(glob.glob(jar_pattern))

    def close(self, spark: Optional[SparkSession]) -> None:
        """
        Close Spark session after execution.

        By default, closes the session to free resources and prevent blocking other models.
        Set DVT_SPARK_KEEP_ALIVE=1 environment variable to keep sessions alive
        in the cache for faster consecutive runs within the same Python
        process (advanced, opt-in).

        Errors raised while closing are swallowed (best effort cleanup).

        :param spark: SparkSession to close (or optionally keep alive); None is a no-op
        """
        # Check if caching is enabled (opt-in, not default)
        keep_alive = os.environ.get("DVT_SPARK_KEEP_ALIVE", "0") == "1"
        if keep_alive or not spark:
            return

        try:
            # Clear from cache first (thread-safe)
            with _SPARK_SESSION_LOCK:
                for key, cached_spark in list(_SPARK_SESSION_CACHE.items()):
                    if cached_spark is spark:
                        del _SPARK_SESSION_CACHE[key]
                        break

            # Stop the session
            spark.stop()
        except Exception:
            pass  # Best effort cleanup

    def estimate_cost(self, duration_minutes: float) -> float:
        """
        Estimate cost for local execution.

        Local execution is free (runs on local machine).

        :param duration_minutes: Estimated query duration (unused)
        :returns: 0.0 (free)
        """
        return 0.0

    def get_platform_name(self) -> str:
        """Get platform name."""
        return "local"

    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark config for JDBC JAR provisioning using local file paths.

        Delegates to LocalJARProvisioning, rooted at the current working
        directory, so that JARs are referenced by local path
        (no download at runtime).

        :param adapter_types: Set of adapter types, passed through to the provisioner
        :returns: Dictionary of Spark config produced by the provisioner
        """
        from dbt.compute.jar_provisioning import LocalJARProvisioning

        provisioning = LocalJARProvisioning(project_dir=os.getcwd())
        return provisioning.get_spark_config(adapter_types)

    def test_connectivity(self) -> Tuple[bool, str]:
        """
        Test connectivity by creating a local Spark session.

        Never raises: every failure is reported through the returned tuple.
        The session obtained for the test is left open (and cached).

        :returns: Tuple of (success, message)
        """
        # Check PySpark at runtime (not module import time)
        try:
            from pyspark.sql import SparkSession as _  # noqa: F401
        except ImportError:
            return (False, "PySpark not installed")

        try:
            spark = self.get_spark_session()
            # Run simple SQL to verify
            spark.sql("SELECT 1 AS test").collect()
            return (True, "Local Spark session created and SQL test passed")
        except Exception as e:
            return (False, f"Local Spark failed: {e}")
|
|
Binary file
|