dvt-core 0.59.0a51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
@@ -0,0 +1,472 @@
1
+ """
2
+ Local Spark Connection Strategy
3
+
4
+ Provides embedded PySpark session for local development and testing.
5
+ This is the default strategy extracted from the original SparkEngine implementation.
6
+
7
+ Includes auto-configuration of Java with PySpark compatibility checking.
8
+
9
+ v0.51.3: Refactored to use java_compat module for centralized Java/PySpark compatibility.
10
+ v0.5.98: Added JAR provisioning using local file paths (spark.jars).
11
+ v0.58.5: Fixed Java 21 segfaults by NOT loading jdk.incubator.vector module.
12
+ """
13
+
14
+ import os
15
+ from typing import Dict, Optional, Set, Tuple
16
+
17
+ from dbt.compute.strategies.base import BaseConnectionStrategy
18
+ from dbt_common.exceptions import DbtRuntimeError
19
+
20
# PySpark is an optional dependency. When it cannot be imported, SparkSession
# is bound to None and PYSPARK_AVAILABLE records that it is missing.
try:
    from pyspark.sql import SparkSession
except ImportError:
    SparkSession = None
    PYSPARK_AVAILABLE = False
else:
    PYSPARK_AVAILABLE = True

# Global Spark session cache for reuse across calls (within same process)
_SPARK_SESSION_CACHE = dict()

# Thread lock guarding access to the session cache
import threading
_SPARK_SESSION_LOCK = threading.Lock()
34
+
35
+
36
def cleanup_all_spark_sessions():
    """
    Stop every cached Spark session and empty the session cache.

    DVT v0.58.4: Call this at the end of runs to prevent semaphore leaks
    and segfaults when the thread pool terminates.

    The whole operation runs while holding the session lock. A session that
    fails to stop is ignored so the remaining sessions are still stopped.
    """
    with _SPARK_SESSION_LOCK:
        # Snapshot the sessions first; the cache itself is emptied afterwards.
        sessions = list(_SPARK_SESSION_CACHE.values())
        for session in sessions:
            try:
                session.stop()
            except Exception:
                # Best effort cleanup
                continue
        _SPARK_SESSION_CACHE.clear()
54
+
55
+
56
def _disable_multiprocessing_resource_tracker():
    """
    Best-effort attempt to disable Python's multiprocessing resource tracker.

    DVT v0.58.5: PySpark 4.0 + Java 21 creates semaphores that conflict with
    Python's resource tracker during shutdown, causing segfaults on macOS.

    The function rebinds two module-level attributes of
    ``multiprocessing.resource_tracker`` (``_resource_tracker`` and ``_fd``)
    to ``None``. Any failure, including a failed import, is swallowed so the
    caller can continue.

    NOTE(review): both names are private interpreter internals. Whether
    rebinding them actually stops the tracker depends on the Python version
    in use - confirm against the targeted interpreters.
    """
    try:
        from multiprocessing import resource_tracker

        # Drop the module-level references to the tracker and its descriptor.
        resource_tracker._resource_tracker = None
        resource_tracker._fd = None
    except Exception:
        pass  # Best effort - if it fails, continue anyway
73
+
74
+
75
def _ensure_java_available():
    """
    Ensure Java is available and compatible with installed PySpark.

    Uses the centralized java_compat module for Java detection and PySpark
    compatibility checking, then points JAVA_HOME and PATH at the selected
    installation.

    v0.51.3: Refactored to use java_compat module with enhanced compatibility checking.
    Always sets JAVA_HOME to a proper JDK path (not /usr or invalid paths).

    The selected ``bin`` directory is only prepended to PATH when it is not
    already the first entry, so repeated calls do not make PATH grow.

    :raises DbtRuntimeError: If PySpark is not installed or no compatible
        Java installation is found
    """
    from dbt.compute.java_compat import (
        get_pyspark_info,
        find_all_java_installations,
        select_best_java,
    )

    # Get PySpark requirements
    pyspark = get_pyspark_info()
    if not pyspark:
        raise DbtRuntimeError(
            "PySpark is not installed. Install it with: pip install pyspark\n"
            "Or run 'dvt spark set-version' to select a specific version."
        )

    # Always search for Java installations and select the best one
    # This ensures JAVA_HOME is set to a proper JDK path (not /usr or invalid)
    all_java = find_all_java_installations()
    best_java = select_best_java(all_java, pyspark.java_supported)

    if not best_java:
        # No compatible Java found - show error with guidance
        supported_str = ", ".join(str(v) for v in pyspark.java_supported)
        raise DbtRuntimeError(
            f"No compatible Java found for PySpark {pyspark.version}.\n"
            f"PySpark {pyspark.major_minor} requires Java {supported_str}.\n\n"
            "Run 'dvt java search' to find Java installations.\n"
            "Run 'dvt java set' to select a compatible version.\n"
            "Run 'dvt java install' for installation guide."
        )

    # Set JAVA_HOME to the best compatible Java found
    # This is needed even if Java is in PATH because PySpark's scripts
    # rely on JAVA_HOME being set to a proper JDK directory
    os.environ["JAVA_HOME"] = best_java.path
    bin_path = os.path.join(best_java.path, "bin")

    current_path = os.environ.get("PATH", "")
    if not current_path:
        # Avoid a trailing separator, which would add an empty PATH entry
        os.environ["PATH"] = bin_path
    elif current_path.split(os.pathsep)[0] != bin_path:
        # Prepend to PATH to ensure this Java is used
        os.environ["PATH"] = bin_path + os.pathsep + current_path
123
+
124
+
125
class LocalStrategy(BaseConnectionStrategy):
    """
    Local embedded Spark strategy.

    Creates an in-process PySpark session with a local master.
    Best for development, testing, and small-medium workloads.

    Configuration:
    {
        "master": "local[*]",           # optional, defaults to local[2]
        "spark.driver.memory": "4g",    # optional, defaults to 1g
        "spark.executor.memory": "4g",  # optional, defaults to 1g
        # ... any other Spark configs
    }
    """

    # Lines of the log4j2 properties file used to silence Spark startup noise
    _LOG4J2_LINES = (
        "status = error",
        "appender.console.type = Console",
        "appender.console.name = console",
        "appender.console.layout.type = PatternLayout",
        "appender.console.layout.pattern = %msg%n",
        "rootLogger.level = error",
        "rootLogger.appenderRef.console.ref = console",
        "logger.spark.name = org.apache.spark",
        "logger.spark.level = error",
        "logger.hadoop.name = org.apache.hadoop",
        "logger.hadoop.level = error",
    )

    # Java module system compatibility flags passed to the Spark driver JVM
    _ADD_OPENS_FLAGS = (
        "--add-opens=java.base/java.lang=ALL-UNNAMED",
        "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
        "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
        "--add-opens=java.base/java.io=ALL-UNNAMED",
        "--add-opens=java.base/java.net=ALL-UNNAMED",
        "--add-opens=java.base/java.nio=ALL-UNNAMED",
        "--add-opens=java.base/java.util=ALL-UNNAMED",
        "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
        "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
        "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
        "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
        "--add-opens=java.base/sun.security.action=ALL-UNNAMED",
        "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED",
        "--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED",
    )

    # Captured stderr lines containing any of these markers are discarded
    _STARTUP_NOISE_MARKERS = (
        'Picked up _JAVA_OPTIONS',
        'Picked up JAVA_TOOL_OPTIONS',
        'Using incubator modules',
        'WARNING:',
        'log4j:',
        'SLF4J:',
    )

    def validate_config(self) -> None:
        """
        Validate local strategy configuration.

        Local strategy is flexible - no required fields. The only check is
        that the config is a dictionary.

        :raises DbtRuntimeError: If the config is not a dictionary
        """
        if not isinstance(self.config, dict):
            raise DbtRuntimeError(
                f"Local Spark config must be a dictionary, got {type(self.config)}"
            )

    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
        """
        Create or reuse a local Spark session.

        Sessions are cached per configuration (keyed by a hash of the config
        items). A cached session is returned when it still responds; otherwise
        any active session is stopped and a new one is built with speed-oriented
        defaults that user config can override.

        JDBC JARs are passed as direct file paths (spark.jars) instead of
        spark.jars.packages to avoid verbose Ivy output. They are discovered
        in ``<current working directory>/.dvt/jdbc_jars``. The directory
        ``~/.dvt/jdbc_jars`` is created here but is not searched for JARs.

        :param adapter_types: Set of adapter types that need JDBC drivers (optional, for API compatibility)
        :returns: Initialized SparkSession
        :raises DbtRuntimeError: If PySpark or a compatible Java is unavailable
        """
        import contextlib
        import hashlib
        import io

        if not PYSPARK_AVAILABLE:
            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")

        # DVT v0.58.5: Disable resource tracker BEFORE any JVM operations
        # This prevents segfaults caused by semaphore cleanup conflicts
        _disable_multiprocessing_resource_tracker()

        # DVT v0.59.0a28: Remove _JAVA_OPTIONS to stop "Picked up" messages
        # JVM options are passed via spark.driver.extraJavaOptions instead
        # Only clear if we set it ourselves (check for our specific flags)
        if "-XX:-UseVectorCmov" in os.environ.get("_JAVA_OPTIONS", ""):
            os.environ.pop("_JAVA_OPTIONS", None)

        # Auto-configure Java first
        _ensure_java_available()

        # Create cache key from config to reuse sessions with same configuration
        config_str = str(sorted(self.config.items()))
        cache_key = hashlib.md5(config_str.encode()).hexdigest()

        # Thread-safe session management
        with _SPARK_SESSION_LOCK:
            if cache_key in _SPARK_SESSION_CACHE:
                cached_spark = _SPARK_SESSION_CACHE[cache_key]
                # Verify session is still active
                try:
                    cached_spark.sparkContext.getConf()  # Will fail if session is dead
                    return cached_spark
                except Exception:
                    # Session died, remove from cache
                    del _SPARK_SESSION_CACHE[cache_key]

            # v0.51.0: Stop any existing session with DIFFERENT config so the
            # new session is built with the settings of this strategy
            try:
                existing = SparkSession.getActiveSession()
                if existing:
                    existing.stop()
                    # Clear the global cache too
                    _SPARK_SESSION_CACHE.clear()
            except Exception:
                pass

        log4j_file = self._write_log4j_config()

        # Make sure the per-user JAR directory exists. JARs themselves are
        # discovered in the project directory by _get_jdbc_jars.
        jar_cache_dir = os.path.join(os.path.expanduser("~/.dvt"), "jdbc_jars")
        os.makedirs(jar_cache_dir, exist_ok=True)
        jar_paths = self._get_jdbc_jars(jar_cache_dir)

        builder = SparkSession.builder.appName(self.app_name)

        # Use local[2] instead of local[*] for faster startup
        builder = builder.master(self.config.get("master", "local[2]"))

        # Apply fast configs (can be overridden by user config)
        for key, value in self._default_spark_configs(jar_paths, log4j_file).items():
            if key not in self.config:
                builder = builder.config(key, value)

        # Apply user-provided configs (except 'master' which is already set)
        for key, value in self.config.items():
            if key != "master":
                builder = builder.config(key, value)

        # DVT v0.59.0: Capture Python-level stderr during session creation
        # and re-emit only the lines that are not known startup noise
        captured = io.StringIO()
        try:
            with contextlib.redirect_stderr(captured):
                spark = builder.getOrCreate()
                # Set log level to ERROR to suppress Spark warnings
                spark.sparkContext.setLogLevel("ERROR")
        finally:
            self._emit_filtered_stderr(captured.getvalue())

        # Cache the session for reuse (thread-safe)
        with _SPARK_SESSION_LOCK:
            _SPARK_SESSION_CACHE[cache_key] = spark

        return spark

    def _write_log4j_config(self) -> str:
        """Write the quiet log4j2 properties file to the temp dir and return its path."""
        import tempfile

        log4j_file = os.path.join(tempfile.gettempdir(), "dvt_log4j2.properties")
        with open(log4j_file, "w") as f:
            f.write("\n" + "\n".join(self._LOG4J2_LINES) + "\n")
        return log4j_file

    def _default_spark_configs(self, jar_paths: list, log4j_file: str) -> Dict[str, str]:
        """Return the default Spark settings applied when the user config does not set them."""
        # Class path entries use the platform separator (":" on POSIX, ";" on Windows)
        class_path = os.pathsep.join(jar_paths) if jar_paths else ""
        java_options = [
            f"-Dlog4j2.configurationFile=file:{log4j_file}",
            "-Djava.util.logging.level=SEVERE",
        ]
        java_options.extend(self._ADD_OPENS_FLAGS)
        java_options.append("-XX:+IgnoreUnrecognizedVMOptions")

        return {
            # Memory optimization
            "spark.driver.memory": "1g",
            "spark.executor.memory": "1g",

            # DVT v0.5.3: Use direct JAR paths (NO Ivy output!)
            "spark.jars": ",".join(jar_paths) if jar_paths else "",

            # DVT v0.58.5: Add JARs to classpath for JDBC driver loading
            "spark.driver.extraClassPath": class_path,
            "spark.executor.extraClassPath": class_path,

            # DVT v0.58.5: Java 21 compatibility flags for PySpark 4.0
            # Minimal JVM options - don't restrict modules (causes Spark failures)
            "spark.driver.extraJavaOptions": " ".join(java_options),

            # Suppress Spark UI and progress
            "spark.ui.enabled": "false",
            "spark.ui.showConsoleProgress": "false",
            "spark.eventLog.enabled": "false",

            # Network optimizations
            "spark.driver.bindAddress": "127.0.0.1",
            "spark.driver.host": "localhost",

            # Reduce shuffle partitions for faster queries on small data
            "spark.sql.shuffle.partitions": "8",

            # DVT v0.58.4: Disable Arrow temporarily to avoid segfaults on macOS + Java 21
            # Arrow's native code can cause segfaults during Spark session creation
            "spark.sql.execution.arrow.pyspark.enabled": "false",
            "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
            "spark.sql.execution.arrow.enabled": "false",

            # Disable adaptive optimization (slow for small data)
            "spark.sql.adaptive.enabled": "false",
            "spark.sql.adaptive.coalescePartitions.enabled": "false",
        }

    def _emit_filtered_stderr(self, captured: str) -> None:
        """Print captured stderr lines to stderr, skipping known startup noise."""
        import sys

        for line in captured.split('\n'):
            line = line.strip()
            if line and not any(skip in line for skip in self._STARTUP_NOISE_MARKERS):
                print(line, file=sys.stderr)

    def _get_jdbc_jars(self, cache_dir: str) -> list:
        """
        Discover ALL JDBC JAR files from project cache at runtime.

        v0.5.96: Dynamic discovery - finds all *.jar files in .dvt/jdbc_jars/
        under the current working directory.
        This enables project folder portability (move folder → JARs still work).

        JARs are downloaded via 'dvt target sync' command.

        :param cache_dir: Directory to look for JAR files (ignored, uses project dir)
        :returns: Sorted list of JAR file paths (empty when none are found)
        """
        import glob

        # Look for JARs in project directory (current working directory)
        jar_cache_dir = os.path.join(os.getcwd(), ".dvt", "jdbc_jars")

        # Discover ALL *.jar files dynamically (not hardcoded list)
        # No warning when empty - user should run 'dvt target sync' if JARs needed
        return sorted(glob.glob(os.path.join(jar_cache_dir, "*.jar")))

    def close(self, spark: Optional[SparkSession]) -> None:
        """
        Close Spark session after execution.

        By default, removes the session from the cache and stops it to free
        resources. Set the DVT_SPARK_KEEP_ALIVE=1 environment variable to keep
        sessions alive for faster consecutive runs within the same Python
        process (advanced, opt-in).

        Errors raised while closing are ignored (best effort cleanup).

        :param spark: SparkSession to close (or optionally keep alive)
        """
        # Check if caching is enabled (opt-in, not default)
        if os.environ.get("DVT_SPARK_KEEP_ALIVE", "0") == "1":
            # Session stays alive in cache for reuse
            return

        if not spark:
            return

        try:
            # Clear from cache first (thread-safe)
            with _SPARK_SESSION_LOCK:
                for key, cached_spark in list(_SPARK_SESSION_CACHE.items()):
                    if cached_spark is spark:
                        del _SPARK_SESSION_CACHE[key]
                        break

            # Stop the session
            spark.stop()
        except Exception:
            pass  # Best effort cleanup

    def estimate_cost(self, duration_minutes: float) -> float:
        """
        Estimate cost for local execution.

        Local execution is free (runs on local machine).

        :param duration_minutes: Estimated query duration (unused)
        :returns: 0.0 (free)
        """
        return 0.0

    def get_platform_name(self) -> str:
        """Get platform name."""
        return "local"

    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark config for JDBC JAR provisioning using local file paths.

        Delegates to LocalJARProvisioning, constructed with the current
        working directory as the project directory.

        :param adapter_types: Set of adapter types, passed through to the provisioner
        :returns: Dictionary of Spark config produced by the provisioner
        """
        from dbt.compute.jar_provisioning import LocalJARProvisioning

        provisioning = LocalJARProvisioning(project_dir=os.getcwd())
        return provisioning.get_spark_config(adapter_types)

    def test_connectivity(self) -> Tuple[bool, str]:
        """
        Test connectivity by creating a local Spark session.

        Never raises: failures are reported through the returned tuple.

        :returns: Tuple of (success, message)
        """
        # Check PySpark at runtime (not module import time)
        try:
            from pyspark.sql import SparkSession as _  # noqa: F401
        except ImportError:
            return (False, "PySpark not installed")

        try:
            spark = self.get_spark_session()
            # Run simple SQL to verify
            spark.sql("SELECT 1 AS test").collect()
            return (True, "Local Spark session created and SQL test passed")
        except Exception as e:
            return (False, f"Local Spark failed: {e}")