dvt-core 0.52.2 (cp310-cp310-macosx_10_9_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/strategies/local.py ADDED
@@ -0,0 +1,364 @@
+"""
+Local Spark Connection Strategy
+
+Provides embedded PySpark session for local development and testing.
+This is the default strategy extracted from the original SparkEngine implementation.
+
+Includes auto-configuration of Java with PySpark compatibility checking.
+
+v0.51.3: Refactored to use java_compat module for centralized Java/PySpark compatibility.
+v0.5.98: Added JAR provisioning using local file paths (spark.jars).
+"""
+
+import os
+from typing import Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+# Global Spark session cache for reuse across calls (within same process)
+_SPARK_SESSION_CACHE = {}
+
+
+def _ensure_java_available():
+    """
+    Ensure Java is available and compatible with installed PySpark.
+
+    Uses the centralized java_compat module for cross-platform Java detection
+    and PySpark compatibility checking.
+
+    v0.51.3: Refactored to use java_compat module with enhanced compatibility checking.
+    Always sets JAVA_HOME to a proper JDK path (not /usr or invalid paths).
+    """
+    from dbt.compute.java_compat import (
+        get_pyspark_info,
+        find_all_java_installations,
+        select_best_java,
+    )
+
+    # Get PySpark requirements
+    pyspark = get_pyspark_info()
+    if not pyspark:
+        raise DbtRuntimeError(
+            "PySpark is not installed. Install it with: pip install pyspark\n"
+            "Or run 'dvt spark set-version' to select a specific version."
+        )
+
+    # Always search for Java installations and select the best one
+    # This ensures JAVA_HOME is set to a proper JDK path (not /usr or invalid)
+    all_java = find_all_java_installations()
+    best_java = select_best_java(all_java, pyspark.java_supported)
+
+    if best_java:
+        # Set JAVA_HOME to the best compatible Java found
+        # This is needed even if Java is in PATH because PySpark's scripts
+        # rely on JAVA_HOME being set to a proper JDK directory
+        os.environ["JAVA_HOME"] = best_java.path
+        bin_path = os.path.join(best_java.path, "bin")
+        # Prepend to PATH to ensure this Java is used
+        os.environ["PATH"] = bin_path + os.pathsep + os.environ.get("PATH", "")
+        return
+
+    # No compatible Java found - show error with guidance
+    supported_str = ", ".join(str(v) for v in pyspark.java_supported)
+    raise DbtRuntimeError(
+        f"No compatible Java found for PySpark {pyspark.version}.\n"
+        f"PySpark {pyspark.major_minor} requires Java {supported_str}.\n\n"
+        f"Run 'dvt java search' to find Java installations.\n"
+        f"Run 'dvt java set' to select a compatible version.\n"
+        f"Run 'dvt java install' for installation guide."
+    )
+
+
+class LocalStrategy(BaseConnectionStrategy):
+    """
+    Local embedded Spark strategy.
+
+    Creates an in-process PySpark session with local[*] master.
+    Best for development, testing, and small-medium workloads.
+
+    Configuration:
+        {
+            "master": "local[*]",  # optional, defaults to local[*]
+            "spark.driver.memory": "4g",  # optional
+            "spark.executor.memory": "4g",  # optional
+            # ... any other Spark configs
+        }
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate local strategy configuration.
+
+        Local strategy is flexible - no required fields.
+        """
+        # Local strategy accepts any config - very flexible
+        # Just ensure it's a dictionary
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"Local Spark config must be a dictionary, got {type(self.config)}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create or reuse local Spark session (BLAZING FAST).
+
+        Creates an embedded PySpark session with optimized configuration for speed.
+        Implements session caching to reuse existing sessions.
+
+        DVT v0.5.3: Uses direct JAR paths instead of spark.jars.packages to avoid
+        verbose Ivy output. JARs are downloaded once and cached in ~/.dvt/jdbc_jars/
+
+        :param adapter_types: Set of adapter types that need JDBC drivers (optional, for API compatibility)
+        :returns: Initialized SparkSession
+        :raises DbtRuntimeError: If session creation fails
+        """
+        import sys
+        import hashlib
+
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        # Auto-configure Java first
+        _ensure_java_available()
+
+        # Create cache key from config to reuse sessions with same configuration
+        config_str = str(sorted(self.config.items()))
+        cache_key = hashlib.md5(config_str.encode()).hexdigest()
+
+        # Check if we have a cached session with this config
+        if cache_key in _SPARK_SESSION_CACHE:
+            cached_spark = _SPARK_SESSION_CACHE[cache_key]
+            # Verify session is still active
+            try:
+                cached_spark.sparkContext.getConf()  # Will fail if session is dead
+                return cached_spark
+            except Exception:
+                # Session died, remove from cache
+                del _SPARK_SESSION_CACHE[cache_key]
+
+        # v0.51.0: Stop any existing session with DIFFERENT config
+        # This ensures we get correct spark.jars.packages for this strategy
+        try:
+            existing = SparkSession.getActiveSession()
+            if existing:
+                existing.stop()
+                # Clear the global cache too
+                _SPARK_SESSION_CACHE.clear()
+        except Exception:
+            pass
+
+        # DVT v0.5.3: Suppress Java/Spark startup warnings completely
+        # Create a custom log4j2 config to silence Spark startup noise
+        import tempfile
+        log4j_config = """
+status = error
+appender.console.type = Console
+appender.console.name = console
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %msg%n
+rootLogger.level = error
+rootLogger.appenderRef.console.ref = console
+logger.spark.name = org.apache.spark
+logger.spark.level = error
+logger.hadoop.name = org.apache.hadoop
+logger.hadoop.level = error
+"""
+        log4j_file = os.path.join(tempfile.gettempdir(), "dvt_log4j2.properties")
+        with open(log4j_file, "w") as f:
+            f.write(log4j_config)
+
+        # Use persistent JAR cache in project directory
+        dvt_home = os.path.expanduser("~/.dvt")
+        jar_cache_dir = os.path.join(dvt_home, "jdbc_jars")
+        os.makedirs(jar_cache_dir, exist_ok=True)
+
+        # DVT v0.5.3: Get cached JDBC jars (from project dir, not home dir)
+        jar_paths = self._get_jdbc_jars(jar_cache_dir)
+
+        builder = SparkSession.builder.appName(self.app_name)
+
+        # Use local[2] instead of local[*] for faster startup
+        master = self.config.get("master", "local[2]")
+        builder = builder.master(master)
+
+        # Optimized default configurations for SPEED
+        fast_configs = {
+            # Memory optimization
+            "spark.driver.memory": "1g",
+            "spark.executor.memory": "1g",
+
+            # DVT v0.5.3: Use direct JAR paths (NO Ivy output!)
+            "spark.jars": ",".join(jar_paths) if jar_paths else "",
+
+            # DVT v0.5.3: Suppress ALL Java/Spark warnings
+            "spark.driver.extraJavaOptions": " ".join([
+                f"-Dlog4j2.configurationFile=file:{log4j_file}",
+                "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
+                "-Djava.util.logging.level=SEVERE",
+            ]),
+
+            # Suppress Spark UI and progress
+            "spark.ui.enabled": "false",
+            "spark.ui.showConsoleProgress": "false",
+            "spark.eventLog.enabled": "false",
+
+            # Network optimizations
+            "spark.driver.bindAddress": "127.0.0.1",
+            "spark.driver.host": "localhost",
+
+            # Reduce shuffle partitions for faster queries on small data
+            "spark.sql.shuffle.partitions": "8",
+
+            # Enable Arrow for efficient data transfer
+            "spark.sql.execution.arrow.pyspark.enabled": "true",
+            "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
+            "spark.sql.execution.arrow.enabled": "true",
+
+            # Disable adaptive optimization (slow for small data)
+            "spark.sql.adaptive.enabled": "false",
+            "spark.sql.adaptive.coalescePartitions.enabled": "false",
+        }
+
+        # Apply fast configs (can be overridden by user config)
+        for key, value in fast_configs.items():
+            if key not in self.config:
+                builder = builder.config(key, value)
+
+        # Apply user-provided configs (except 'master' which is already set)
+        for key, value in self.config.items():
+            if key != "master":
+                builder = builder.config(key, value)
+
+        # Create Spark session
+        spark = builder.getOrCreate()
+
+        # Set log level to ERROR to suppress Spark warnings
+        spark.sparkContext.setLogLevel("ERROR")
+
+        # Cache the session for reuse
+        _SPARK_SESSION_CACHE[cache_key] = spark
+
+        return spark
+
+    def _get_jdbc_jars(self, cache_dir: str) -> list:
+        """
+        Discover ALL JDBC JAR files from project cache at runtime.
+
+        v0.5.96: Dynamic discovery - finds all *.jar files in .dvt/jdbc_jars/
+        This enables project folder portability (move folder → JARs still work).
+
+        JARs are downloaded via 'dvt target sync' command.
+
+        :param cache_dir: Directory to look for JAR files (ignored, uses project dir)
+        :returns: List of JAR file absolute paths
+        """
+        import glob
+
+        # Look for JARs in project directory (current working directory)
+        project_dir = os.getcwd()
+        jar_cache_dir = os.path.join(project_dir, ".dvt", "jdbc_jars")
+
+        # Discover ALL *.jar files dynamically (not hardcoded list)
+        jar_pattern = os.path.join(jar_cache_dir, "*.jar")
+        jar_paths = sorted(glob.glob(jar_pattern))
+
+        # No warning needed - clean output
+        # User should run 'dvt target sync' if JARs needed
+
+        return jar_paths
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Close Spark session after execution.
+
+        By default, closes the session to free resources and prevent blocking other models.
+        Session caching can be enabled by setting DVT_SPARK_KEEP_ALIVE=1 for faster
+        consecutive runs within the same Python process.
+
+        Set DVT_SPARK_KEEP_ALIVE=1 environment variable to keep sessions alive (advanced).
+
+        :param spark: SparkSession to close (or optionally keep alive)
+        """
+        import os
+
+        # Check if caching is enabled (opt-in, not default)
+        keep_alive = os.environ.get("DVT_SPARK_KEEP_ALIVE", "0") == "1"
+
+        if keep_alive:
+            # DVT v0.4.8: Suppressed verbose output
+            # Session stays alive in cache for reuse (opt-in)
+            # print("[DVT] Spark session kept alive in cache (DVT_SPARK_KEEP_ALIVE=1)", flush=True)
+            pass
+        elif spark:
+            try:
+                # Clear from cache first
+                for key, cached_spark in list(_SPARK_SESSION_CACHE.items()):
+                    if cached_spark is spark:
+                        del _SPARK_SESSION_CACHE[key]
+                        break
+
+                # Stop the session
+                spark.stop()
+                # DVT v0.4.8: Suppressed verbose output
+                # print("[DVT] ✓ Spark session closed", flush=True)
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for local execution.
+
+        Local execution is free (runs on local machine).
+
+        :param duration_minutes: Estimated query duration
+        :returns: 0.0 (free)
+        """
+        return 0.0
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "local"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using local file paths.
+
+        Local Spark uses spark.jars with local file paths from .dvt/jdbc_jars/
+        for instant startup (no download at runtime).
+
+        :param adapter_types: Set of adapter types (ignored - uses all JARs found)
+        :returns: Dictionary with spark.jars config
+        """
+        from dbt.compute.jar_provisioning import LocalJARProvisioning
+
+        provisioning = LocalJARProvisioning(project_dir=os.getcwd())
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity by creating a local Spark session.
+
+        :returns: Tuple of (success, message)
+        """
+        # Check PySpark at runtime (not module import time)
+        try:
+            from pyspark.sql import SparkSession as _  # noqa: F401
+        except ImportError:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            # Run simple SQL to verify
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Local Spark session created and SQL test passed")
+        except Exception as e:
+            return (False, f"Local Spark failed: {e}")
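Note: the following usage sketch is editorial and not part of the wheel contents above. It illustrates how the LocalStrategy added in this file might be driven directly; the constructor keywords (config, app_name) are assumed from the attributes the class reads (self.config, self.app_name), since the base class in dbt/compute/strategies/base.py is not shown in this hunk.

    # Hypothetical driver for LocalStrategy; constructor kwargs are assumed, not taken from the diff.
    from dbt.compute.strategies.local import LocalStrategy

    config = {
        "master": "local[2]",         # optional; get_spark_session() defaults to local[2]
        "spark.driver.memory": "2g",  # overrides the 1g fast default shown above
    }
    strategy = LocalStrategy(config=config, app_name="dvt-local-demo")
    strategy.validate_config()             # only checks that the config is a dictionary

    spark = strategy.get_spark_session()   # configures Java, JARs, and log4j, then getOrCreate()
    spark.sql("SELECT 1 AS smoke").show()
    strategy.close(spark)                  # stops the session unless DVT_SPARK_KEEP_ALIVE=1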
dbt/compute/strategies/standalone.py ADDED
@@ -0,0 +1,262 @@
+"""
+Standalone Spark Cluster Connection Strategy
+
+Provides connection to self-managed Spark clusters (on-premises or cloud VMs).
+
+v0.5.98: New strategy for standalone Spark clusters with Maven-based JAR provisioning.
+Fixes the bug where external clusters incorrectly fell back to LocalStrategy
+with local JAR paths that don't exist on remote workers.
+
+Configuration:
+    {
+        "master": "spark://master-node:7077",  # Required: Spark master URL
+        "spark.driver.memory": "4g",  # Optional: driver memory
+        "spark.executor.memory": "8g",  # Optional: executor memory
+        "spark.executor.cores": "4",  # Optional: cores per executor
+        "spark.executor.instances": "10",  # Optional: number of executors
+    }
+
+Requirements:
+    - Standalone Spark cluster must be running
+    - Spark master must be accessible from client machine
+    - Workers must have network access to Maven Central (for JAR downloads)
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class StandaloneStrategy(BaseConnectionStrategy):
+    """
+    Standalone Spark cluster connection strategy.
+
+    Connects to self-managed Spark clusters using spark:// master URL.
+    Uses spark.jars.packages for JDBC JAR provisioning so workers can
+    download drivers from Maven Central.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate Standalone strategy configuration.
+
+        Required:
+            - master: Must start with "spark://" for standalone clusters
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"Standalone config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check master format
+        master = self.config.get("master", "")
+        if not master.startswith("spark://"):
+            raise DbtRuntimeError(
+                f"Standalone config requires master to start with 'spark://', got: {master}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to standalone cluster.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to standalone cluster
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            # v0.51.0: Ensure Java is available
+            from dbt.compute.strategies.local import _ensure_java_available
+            _ensure_java_available()
+
+            # v0.51.0: Stop any existing session to ensure fresh config
+            existing = SparkSession.getActiveSession()
+            if existing:
+                existing.stop()
+
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set master URL
+            master = self.config.get("master")
+            builder = builder.master(master)
+
+            # v0.5.99: Get JDBC JAR config (Maven coordinates for remote workers)
+            # Merge with user-provided spark.jars.packages instead of overwriting
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            auto_packages = []
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                auto_packages_str = jar_config.get("spark.jars.packages", "")
+                if auto_packages_str:
+                    auto_packages = [p.strip() for p in auto_packages_str.split(",") if p.strip()]
+
+            # Get user-provided packages from config
+            user_packages_str = self.config.get("spark.jars.packages", "")
+            user_packages = [p.strip() for p in user_packages_str.split(",") if p.strip()]
+
+            # Merge packages (user + auto-detected)
+            all_packages = list(set(user_packages + auto_packages))
+            if all_packages:
+                builder = builder.config("spark.jars.packages", ",".join(all_packages))
+
+            # Apply user-provided configs (except spark.jars.packages which we merged)
+            for key, value in self.config.items():
+                if key != "master" and key != "spark.jars.packages":
+                    builder = builder.config(key, value)

+            # Default optimizations
+            default_configs = {
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # DVT v0.51.5: Auto-configure driver host for Docker Spark clusters
+            # When master is on localhost, workers (in Docker containers) need to reach
+            # the driver running on the host machine via host.docker.internal
+            if "spark.driver.host" not in self.config:
+                if "localhost" in master or "127.0.0.1" in master:
+                    builder = builder.config("spark.driver.host", "host.docker.internal")
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            master = self.config.get("master", "unknown")
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to Spark master at '{master}'. "
+                    f"Ensure the cluster is running and accessible. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create Standalone Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For standalone clusters, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for standalone cluster execution.
+
+        For self-managed clusters, returns 0.0 as cost depends on infrastructure.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: 0.0 (infrastructure cost varies)
+        """
+        # Self-managed clusters have variable cost based on infrastructure
+        return 0.0
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "standalone"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        Standalone clusters need spark.jars.packages so workers can download
+        JDBC drivers from Maven Central. Local file paths don't work because
+        they're not available on remote worker nodes.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to standalone Spark cluster.
+
+        v0.51.1: Added timeout to prevent hanging when workers unavailable.
+        v0.51.8: Increased timeout to 90s for Docker clusters (JDBC JAR download time).
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        import concurrent.futures
+
+        master = self.config.get("master", "unknown")
+
+        def _run_test():
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return True
+
+        try:
+            # Use ThreadPoolExecutor with timeout to prevent hanging
+            # when workers aren't available
+            # v0.51.8: Increased from 30s to 90s - Docker Spark clusters need time
+            # for JDBC JAR downloads from Maven on first run
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(_run_test)
+                try:
+                    result = future.result(timeout=90)  # 90 second timeout for JAR downloads
+                    return (True, "Standalone cluster session created and SQL test passed")
+                except concurrent.futures.TimeoutError:
+                    return (False,
+                        f"Timeout (90s): Workers not responding at '{master}'.\n"
+                        f"Check: cluster workers are running, network access from driver to workers.\n"
+                        f"Note: First run may take longer due to JDBC JAR downloads."
+                    )
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, f"Cannot connect to Spark master at '{master}'")
+            if "Initial job has not accepted any resources" in error_msg:
+                return (False,
+                    f"Workers not accepting tasks at '{master}'.\n"
+                    f"Check: spark.driver.host is set correctly for your network topology"
+                )
+            return (False, f"Standalone connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the standalone cluster configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "standalone",
+            "master": self.config.get("master", "unknown"),
+            "executor_instances": self.config.get("spark.executor.instances", "dynamic"),
+            "executor_memory": self.config.get("spark.executor.memory", "default"),
+            "executor_cores": self.config.get("spark.executor.cores", "default"),
+        }
dbt/config/__init__.py ADDED
@@ -0,0 +1,4 @@
+# all these are just exports, they need "noqa" so flake8 will not complain.
+from .profile import Profile  # noqa
+from .project import IsFQNResource, PartialProject, Project  # noqa
+from .runtime import RuntimeConfig  # noqa
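Note (editorial): the re-exports above let callers import the configuration classes from the package root rather than the individual modules, for example:

    from dbt.config import Profile, Project, RuntimeConfig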