dvt_core-0.52.2-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dvt-core might be problematic.

Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/strategies/dataproc.py
@@ -0,0 +1,207 @@
+"""
+GCP Dataproc Spark Connection Strategy
+
+Provides connection to Google Cloud Dataproc Spark clusters.
+
+v0.5.98: New strategy for GCP Dataproc clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "project": "my-gcp-project",        # Required: GCP project ID
+        "region": "us-central1",            # Required: Dataproc region
+        "cluster": "my-dataproc-cluster",   # Required: Cluster name
+        "spark.driver.memory": "4g",        # Optional: driver memory
+        "spark.executor.memory": "8g",      # Optional: executor memory
+    }
+
+Requirements:
+    - GCP Dataproc cluster must be running
+    - gcloud SDK configured (gcloud auth login)
+    - Dataproc connector or direct YARN access
+
+Cost Estimate:
+    - Typical 5-node Dataproc cluster: ~$1.00/hr (n1-standard-4 instances)
+    - Dataproc pricing includes Spark/Hadoop runtime at no extra cost
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class DataprocStrategy(BaseConnectionStrategy):
+    """
+    GCP Dataproc Spark cluster connection strategy.
+
+    Connects to Dataproc clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate Dataproc strategy configuration.
+
+        Required:
+        - project: GCP project ID
+        - region: Dataproc region
+        - cluster: Cluster name
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"Dataproc config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check required fields
+        required_fields = ["project", "region", "cluster"]
+        missing = [f for f in required_fields if f not in self.config]
+        if missing:
+            raise DbtRuntimeError(
+                f"Dataproc config missing required fields: {', '.join(missing)}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to Dataproc cluster.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to Dataproc
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master for Dataproc
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key not in ("project", "region", "cluster"):
+                    builder = builder.config(key, value)
+
+            # Default Dataproc optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to Dataproc cluster '{self.config.get('cluster')}'. "
+                    f"Ensure the cluster is running. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create Dataproc Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For Dataproc, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for Dataproc execution.
+
+        Based on typical 5-node Dataproc cluster with n1-standard-4 instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical Dataproc cluster: 5x n1-standard-4 @ ~$0.19/hr each = ~$0.95/hr total
+        # Plus Dataproc fee: $0.01/vCPU/hr = ~$0.20/hr for 20 vCPUs
+        hourly_cost = 1.15
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "dataproc"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        Dataproc clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to Dataproc cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Dataproc session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to Dataproc cluster (connection refused)")
+            return (False, f"Dataproc connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the Dataproc configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "dataproc",
+            "project": self.config.get("project", "unknown"),
+            "region": self.config.get("region", "unknown"),
+            "cluster": self.config.get("cluster", "unknown"),
+        }
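
For orientation before the companion EMR strategy below, here is a minimal usage sketch of DataprocStrategy. The constructor call and the "postgres" adapter type are assumptions (the diff only shows that instances expose self.config and self.app_name via BaseConnectionStrategy); the method names are the ones defined above.

    # Hypothetical usage sketch -- the constructor signature is assumed,
    # not documented in this diff; method names come from the class above.
    from dbt.compute.strategies.dataproc import DataprocStrategy

    config = {
        "project": "my-gcp-project",       # Required: GCP project ID
        "region": "us-central1",           # Required: Dataproc region
        "cluster": "my-dataproc-cluster",  # Required: cluster name
        "spark.executor.memory": "8g",     # Optional: passed through to the builder
    }

    strategy = DataprocStrategy(config)    # assumed: base class accepts the config dict
    strategy.validate_config()             # raises DbtRuntimeError on missing fields

    ok, message = strategy.test_connectivity()
    if ok:
        spark = strategy.get_spark_session(adapter_types={"postgres"})  # adapter type assumed
        try:
            spark.sql("SELECT 1 AS smoke_test").show()
        finally:
            strategy.close(spark)  # stops the app; the cluster keeps running
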
dbt/compute/strategies/emr.py
@@ -0,0 +1,203 @@
+"""
+AWS EMR (Elastic MapReduce) Spark Connection Strategy
+
+Provides connection to AWS EMR Spark clusters via YARN.
+
+v0.5.98: New strategy for AWS EMR clusters with Maven-based JAR provisioning.
+
+Configuration:
+    {
+        "master": "yarn",                     # Required: YARN resource manager
+        "spark.submit.deployMode": "client",  # Optional: client or cluster
+        "spark.driver.memory": "4g",          # Optional: driver memory
+        "spark.executor.memory": "8g",        # Optional: executor memory
+        "spark.executor.instances": "4",      # Optional: number of executors
+    }
+
+Requirements:
+    - AWS EMR cluster must be running
+    - AWS credentials configured (aws configure or IAM role)
+    - Spark must be accessible from client machine (e.g., via SSH tunnel or VPN)
+
+Cost Estimate:
+    - Typical 5-node EMR cluster: ~$1.20/hr (m5.xlarge instances)
+    - On-demand pricing varies by instance type and region
+"""
+
+from typing import Any, Dict, Optional, Set, Tuple
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt_common.exceptions import DbtRuntimeError
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class EMRStrategy(BaseConnectionStrategy):
+    """
+    AWS EMR Spark cluster connection strategy.
+
+    Connects to EMR clusters using YARN as the resource manager.
+    Uses spark.jars.packages for JDBC JAR provisioning.
+    """
+
+    def validate_config(self) -> None:
+        """
+        Validate EMR strategy configuration.
+
+        Required:
+        - master: Must be "yarn" for EMR
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        if not isinstance(self.config, dict):
+            raise DbtRuntimeError(
+                f"EMR config must be a dictionary, got {type(self.config)}"
+            )
+
+        # Check master is yarn
+        master = self.config.get("master", "")
+        if master.lower() != "yarn":
+            raise DbtRuntimeError(
+                f"EMR config requires master='yarn', got: {master}"
+            )
+
+    def get_spark_session(self, adapter_types: Optional[Set[str]] = None) -> SparkSession:
+        """
+        Create Spark session connected to EMR cluster via YARN.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Initialized SparkSession connected to EMR
+        :raises DbtRuntimeError: If session creation fails
+        """
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+        try:
+            builder = SparkSession.builder.appName(self.app_name)
+
+            # Set YARN master
+            builder = builder.master("yarn")
+
+            # Get JDBC JAR config
+            if adapter_types is None:
+                from dbt.compute.jar_provisioning import get_required_adapter_types
+                adapter_types = get_required_adapter_types()
+
+            if adapter_types:
+                jar_config = self.get_jar_provisioning_config(adapter_types)
+                for key, value in jar_config.items():
+                    builder = builder.config(key, value)
+
+            # Apply user-provided configs
+            for key, value in self.config.items():
+                if key != "master":  # master already set
+                    builder = builder.config(key, value)
+
+            # Default EMR optimizations
+            default_configs = {
+                "spark.submit.deployMode": "client",
+                "spark.dynamicAllocation.enabled": "true",
+                "spark.sql.execution.arrow.pyspark.enabled": "true",
+            }
+            for key, value in default_configs.items():
+                if key not in self.config:
+                    builder = builder.config(key, value)
+
+            # Create session
+            spark = builder.getOrCreate()
+            spark.sparkContext.setLogLevel("WARN")
+
+            return spark
+
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                raise DbtRuntimeError(
+                    f"Cannot connect to EMR cluster. Ensure the cluster is running "
+                    f"and accessible from this machine. Error: {error_msg}"
+                ) from e
+            raise DbtRuntimeError(f"Failed to create EMR Spark session: {error_msg}") from e
+
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        For EMR, we stop the application but the cluster continues running.
+
+        :param spark: SparkSession to clean up
+        """
+        if spark:
+            try:
+                spark.stop()
+            except Exception:
+                pass  # Best effort cleanup
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for EMR execution.
+
+        Based on typical 5-node EMR cluster with m5.xlarge instances.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        # Typical EMR cluster: 5x m5.xlarge @ ~$0.24/hr each = ~$1.20/hr total
+        hourly_cost = 1.20
+        hours = duration_minutes / 60.0
+        return round(hourly_cost * hours, 2)
+
+    def get_platform_name(self) -> str:
+        """Get platform name."""
+        return "emr"
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark config for JDBC JAR provisioning using Maven coordinates.
+
+        EMR clusters download JDBC drivers from Maven Central at session startup.
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary with spark.jars.packages config
+        """
+        from dbt.compute.jar_provisioning import RemoteJARProvisioning
+
+        provisioning = RemoteJARProvisioning()
+        return provisioning.get_spark_config(adapter_types)
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test connectivity to EMR cluster.
+
+        :returns: Tuple of (success, message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            spark = self.get_spark_session()
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "EMR session created and SQL test passed")
+        except Exception as e:
+            error_msg = str(e)
+            if "Connection refused" in error_msg:
+                return (False, "Cannot connect to EMR cluster (connection refused)")
+            return (False, f"EMR connection failed: {e}")
+
+    def get_cluster_info(self) -> Dict[str, Any]:
+        """
+        Get information about the EMR configuration.
+
+        :returns: Dictionary with cluster metadata
+        """
+        return {
+            "platform": "emr",
+            "master": self.config.get("master", "yarn"),
+            "deploy_mode": self.config.get("spark.submit.deployMode", "client"),
+            "executor_instances": self.config.get("spark.executor.instances", "dynamic"),
+        }
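
Both strategies share the same flat-rate cost model: multiply an assumed hourly cluster price by the duration in hours and round to cents. A standalone sketch of that arithmetic (no package imports; the rates are the ones stated in the diffs above):

    # Re-statement of the estimate_cost() arithmetic from both strategies.
    # EMR rate: 5x m5.xlarge @ ~$0.24/hr = $1.20/hr.
    # Dataproc rate: ~$0.95/hr compute + ~$0.20/hr Dataproc fee = $1.15/hr.
    def estimate_cost(duration_minutes: float, hourly_cost: float) -> float:
        hours = duration_minutes / 60.0
        return round(hourly_cost * hours, 2)

    print(estimate_cost(90, 1.20))   # EMR, 90 minutes -> 1.8
    print(estimate_cost(120, 1.15))  # Dataproc, 2 hours -> 2.3
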