dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2403 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
  74. dbt/compute/engines/spark_engine.py +642 -0
  75. dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
  76. dbt/compute/federated_executor.py +1080 -0
  77. dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
  78. dbt/compute/filter_pushdown.py +273 -0
  79. dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
  80. dbt/compute/jar_provisioning.py +255 -0
  81. dbt/compute/java_compat.cpython-311-darwin.so +0 -0
  82. dbt/compute/java_compat.py +689 -0
  83. dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
  84. dbt/compute/jdbc_utils.py +678 -0
  85. dbt/compute/metadata/__init__.py +40 -0
  86. dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
  87. dbt/compute/metadata/adapters_registry.py +370 -0
  88. dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
  89. dbt/compute/metadata/registry.py +674 -0
  90. dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
  91. dbt/compute/metadata/store.py +1499 -0
  92. dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
  93. dbt/compute/smart_selector.py +377 -0
  94. dbt/compute/strategies/__init__.py +55 -0
  95. dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
  96. dbt/compute/strategies/base.py +165 -0
  97. dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
  98. dbt/compute/strategies/dataproc.py +207 -0
  99. dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
  100. dbt/compute/strategies/emr.py +203 -0
  101. dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
  102. dbt/compute/strategies/local.py +443 -0
  103. dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
  104. dbt/compute/strategies/standalone.py +262 -0
  105. dbt/config/__init__.py +4 -0
  106. dbt/config/catalogs.py +94 -0
  107. dbt/config/compute.cpython-311-darwin.so +0 -0
  108. dbt/config/compute.py +513 -0
  109. dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
  110. dbt/config/dvt_profile.py +342 -0
  111. dbt/config/profile.py +422 -0
  112. dbt/config/project.py +873 -0
  113. dbt/config/project_utils.py +28 -0
  114. dbt/config/renderer.py +231 -0
  115. dbt/config/runtime.py +553 -0
  116. dbt/config/selectors.py +208 -0
  117. dbt/config/utils.py +77 -0
  118. dbt/constants.py +28 -0
  119. dbt/context/__init__.py +0 -0
  120. dbt/context/base.py +745 -0
  121. dbt/context/configured.py +135 -0
  122. dbt/context/context_config.py +382 -0
  123. dbt/context/docs.py +82 -0
  124. dbt/context/exceptions_jinja.py +178 -0
  125. dbt/context/macro_resolver.py +195 -0
  126. dbt/context/macros.py +171 -0
  127. dbt/context/manifest.py +72 -0
  128. dbt/context/providers.py +2249 -0
  129. dbt/context/query_header.py +13 -0
  130. dbt/context/secret.py +58 -0
  131. dbt/context/target.py +74 -0
  132. dbt/contracts/__init__.py +0 -0
  133. dbt/contracts/files.py +413 -0
  134. dbt/contracts/graph/__init__.py +0 -0
  135. dbt/contracts/graph/manifest.py +1904 -0
  136. dbt/contracts/graph/metrics.py +97 -0
  137. dbt/contracts/graph/model_config.py +70 -0
  138. dbt/contracts/graph/node_args.py +42 -0
  139. dbt/contracts/graph/nodes.py +1806 -0
  140. dbt/contracts/graph/semantic_manifest.py +232 -0
  141. dbt/contracts/graph/unparsed.py +811 -0
  142. dbt/contracts/project.py +417 -0
  143. dbt/contracts/results.py +53 -0
  144. dbt/contracts/selection.py +23 -0
  145. dbt/contracts/sql.py +85 -0
  146. dbt/contracts/state.py +68 -0
  147. dbt/contracts/util.py +46 -0
  148. dbt/deprecations.py +348 -0
  149. dbt/deps/__init__.py +0 -0
  150. dbt/deps/base.py +152 -0
  151. dbt/deps/git.py +195 -0
  152. dbt/deps/local.py +79 -0
  153. dbt/deps/registry.py +130 -0
  154. dbt/deps/resolver.py +149 -0
  155. dbt/deps/tarball.py +120 -0
  156. dbt/docs/source/_ext/dbt_click.py +119 -0
  157. dbt/docs/source/conf.py +32 -0
  158. dbt/env_vars.py +64 -0
  159. dbt/event_time/event_time.py +40 -0
  160. dbt/event_time/sample_window.py +60 -0
  161. dbt/events/__init__.py +15 -0
  162. dbt/events/base_types.py +36 -0
  163. dbt/events/core_types_pb2.py +2 -0
  164. dbt/events/logging.py +108 -0
  165. dbt/events/types.py +2516 -0
  166. dbt/exceptions.py +1486 -0
  167. dbt/flags.py +89 -0
  168. dbt/graph/__init__.py +11 -0
  169. dbt/graph/cli.py +249 -0
  170. dbt/graph/graph.py +172 -0
  171. dbt/graph/queue.py +214 -0
  172. dbt/graph/selector.py +374 -0
  173. dbt/graph/selector_methods.py +975 -0
  174. dbt/graph/selector_spec.py +222 -0
  175. dbt/graph/thread_pool.py +18 -0
  176. dbt/hooks.py +21 -0
  177. dbt/include/README.md +49 -0
  178. dbt/include/__init__.py +3 -0
  179. dbt/include/data/adapters_registry.duckdb +0 -0
  180. dbt/include/data/build_registry.py +242 -0
  181. dbt/include/data/csv/adapter_queries.csv +33 -0
  182. dbt/include/data/csv/syntax_rules.csv +9 -0
  183. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  184. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  185. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  186. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  187. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  188. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  189. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  190. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  191. dbt/include/starter_project/.gitignore +4 -0
  192. dbt/include/starter_project/README.md +15 -0
  193. dbt/include/starter_project/__init__.py +3 -0
  194. dbt/include/starter_project/analyses/.gitkeep +0 -0
  195. dbt/include/starter_project/dbt_project.yml +36 -0
  196. dbt/include/starter_project/macros/.gitkeep +0 -0
  197. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  198. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  199. dbt/include/starter_project/models/example/schema.yml +21 -0
  200. dbt/include/starter_project/seeds/.gitkeep +0 -0
  201. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  202. dbt/include/starter_project/tests/.gitkeep +0 -0
  203. dbt/internal_deprecations.py +26 -0
  204. dbt/jsonschemas/__init__.py +3 -0
  205. dbt/jsonschemas/jsonschemas.py +309 -0
  206. dbt/jsonschemas/project/0.0.110.json +4717 -0
  207. dbt/jsonschemas/project/0.0.85.json +2015 -0
  208. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  209. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  210. dbt/jsonschemas/resources/latest.json +6773 -0
  211. dbt/links.py +4 -0
  212. dbt/materializations/__init__.py +0 -0
  213. dbt/materializations/incremental/__init__.py +0 -0
  214. dbt/materializations/incremental/microbatch.py +236 -0
  215. dbt/mp_context.py +8 -0
  216. dbt/node_types.py +37 -0
  217. dbt/parser/__init__.py +23 -0
  218. dbt/parser/analysis.py +21 -0
  219. dbt/parser/base.py +548 -0
  220. dbt/parser/common.py +266 -0
  221. dbt/parser/docs.py +52 -0
  222. dbt/parser/fixtures.py +51 -0
  223. dbt/parser/functions.py +30 -0
  224. dbt/parser/generic_test.py +100 -0
  225. dbt/parser/generic_test_builders.py +333 -0
  226. dbt/parser/hooks.py +118 -0
  227. dbt/parser/macros.py +137 -0
  228. dbt/parser/manifest.py +2204 -0
  229. dbt/parser/models.py +573 -0
  230. dbt/parser/partial.py +1178 -0
  231. dbt/parser/read_files.py +445 -0
  232. dbt/parser/schema_generic_tests.py +422 -0
  233. dbt/parser/schema_renderer.py +111 -0
  234. dbt/parser/schema_yaml_readers.py +935 -0
  235. dbt/parser/schemas.py +1466 -0
  236. dbt/parser/search.py +149 -0
  237. dbt/parser/seeds.py +28 -0
  238. dbt/parser/singular_test.py +20 -0
  239. dbt/parser/snapshots.py +44 -0
  240. dbt/parser/sources.py +558 -0
  241. dbt/parser/sql.py +62 -0
  242. dbt/parser/unit_tests.py +621 -0
  243. dbt/plugins/__init__.py +20 -0
  244. dbt/plugins/contracts.py +9 -0
  245. dbt/plugins/exceptions.py +2 -0
  246. dbt/plugins/manager.py +163 -0
  247. dbt/plugins/manifest.py +21 -0
  248. dbt/profiler.py +20 -0
  249. dbt/py.typed +1 -0
  250. dbt/query_analyzer.cpython-311-darwin.so +0 -0
  251. dbt/query_analyzer.py +410 -0
  252. dbt/runners/__init__.py +2 -0
  253. dbt/runners/exposure_runner.py +7 -0
  254. dbt/runners/no_op_runner.py +45 -0
  255. dbt/runners/saved_query_runner.py +7 -0
  256. dbt/selected_resources.py +8 -0
  257. dbt/task/__init__.py +0 -0
  258. dbt/task/base.py +503 -0
  259. dbt/task/build.py +197 -0
  260. dbt/task/clean.py +56 -0
  261. dbt/task/clone.py +161 -0
  262. dbt/task/compile.py +150 -0
  263. dbt/task/compute.cpython-311-darwin.so +0 -0
  264. dbt/task/compute.py +458 -0
  265. dbt/task/debug.py +505 -0
  266. dbt/task/deps.py +280 -0
  267. dbt/task/docs/__init__.py +3 -0
  268. dbt/task/docs/api/__init__.py +23 -0
  269. dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
  270. dbt/task/docs/api/catalog.py +204 -0
  271. dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
  272. dbt/task/docs/api/lineage.py +234 -0
  273. dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
  274. dbt/task/docs/api/profile.py +204 -0
  275. dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
  276. dbt/task/docs/api/spark.py +186 -0
  277. dbt/task/docs/generate.py +947 -0
  278. dbt/task/docs/index.html +250 -0
  279. dbt/task/docs/serve.cpython-311-darwin.so +0 -0
  280. dbt/task/docs/serve.py +174 -0
  281. dbt/task/dvt_output.py +362 -0
  282. dbt/task/dvt_run.py +204 -0
  283. dbt/task/freshness.py +322 -0
  284. dbt/task/function.py +121 -0
  285. dbt/task/group_lookup.py +46 -0
  286. dbt/task/init.cpython-311-darwin.so +0 -0
  287. dbt/task/init.py +604 -0
  288. dbt/task/java.cpython-311-darwin.so +0 -0
  289. dbt/task/java.py +316 -0
  290. dbt/task/list.py +236 -0
  291. dbt/task/metadata.cpython-311-darwin.so +0 -0
  292. dbt/task/metadata.py +804 -0
  293. dbt/task/printer.py +175 -0
  294. dbt/task/profile.cpython-311-darwin.so +0 -0
  295. dbt/task/profile.py +1307 -0
  296. dbt/task/profile_serve.py +615 -0
  297. dbt/task/retract.py +438 -0
  298. dbt/task/retry.py +175 -0
  299. dbt/task/run.py +1387 -0
  300. dbt/task/run_operation.py +141 -0
  301. dbt/task/runnable.py +758 -0
  302. dbt/task/seed.py +103 -0
  303. dbt/task/show.py +149 -0
  304. dbt/task/snapshot.py +56 -0
  305. dbt/task/spark.cpython-311-darwin.so +0 -0
  306. dbt/task/spark.py +414 -0
  307. dbt/task/sql.py +110 -0
  308. dbt/task/target_sync.cpython-311-darwin.so +0 -0
  309. dbt/task/target_sync.py +766 -0
  310. dbt/task/test.py +464 -0
  311. dbt/tests/fixtures/__init__.py +1 -0
  312. dbt/tests/fixtures/project.py +620 -0
  313. dbt/tests/util.py +651 -0
  314. dbt/tracking.py +529 -0
  315. dbt/utils/__init__.py +3 -0
  316. dbt/utils/artifact_upload.py +151 -0
  317. dbt/utils/utils.py +408 -0
  318. dbt/version.py +270 -0
  319. dvt_cli/__init__.py +72 -0
  320. dvt_core-0.58.6.dist-info/METADATA +288 -0
  321. dvt_core-0.58.6.dist-info/RECORD +324 -0
  322. dvt_core-0.58.6.dist-info/WHEEL +5 -0
  323. dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
  324. dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/config/compute.py ADDED
@@ -0,0 +1,513 @@
+ """
+ Compute Cluster Registry
+
+ Manages external compute cluster configurations for DVT.
+
+ v0.55.0: Computes stored in <project>/.dvt/computes.yml (project-level)
+ Managed exclusively via `dvt compute` CLI commands.
+ Contains comprehensive commented samples for all platforms.
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import yaml
+ from dbt_common.exceptions import DbtRuntimeError
+
+
+ def get_project_dvt_dir(project_dir) -> Path:
+     """Get the DVT config directory for a project (<project>/.dvt/).
+
+     :param project_dir: Path to project root directory (str or Path)
+     """
+     return Path(project_dir) / ".dvt"
+
+
+ class SparkPlatform(Enum):
+     """Spark platform types for connection strategies.
+
+     v0.51.2: Removed DATABRICKS (serverless cannot read external JDBC sources).
+     """
+
+     LOCAL = "local"
+     EMR = "emr"
+     DATAPROC = "dataproc"
+     STANDALONE = "standalone"  # Self-managed Spark clusters (spark://)
+     EXTERNAL = "external"  # Generic external cluster (fallback)
+
+
+ # Default computes.yml template with comprehensive commented samples
+ DEFAULT_COMPUTES_YAML = """# ============================================================================
+ # DVT Compute Engines Configuration (v0.5.98)
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ #
+ # Commands:
+ #   dvt compute test        Test connectivity to all compute engines
+ #   dvt compute edit        Open this file in your editor
+ #   dvt compute validate    Validate YAML syntax
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   - Local Spark: Uses spark.jars with local file paths (fast startup)
+ #   - Remote clusters: Uses spark.jars.packages with Maven coordinates
+ #     (workers download JARs from Maven Central at session start)
+ #
+ # Platform Detection:
+ #   DVT auto-detects the platform from config keys:
+ #   - Dataproc: project + region + cluster
+ #   - EMR: master=yarn (without Dataproc keys)
+ #   - Standalone: master=spark://...
+ #   - Local: master=local[*] or no master
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: spark-local
+
+ # ============================================================================
+ # COMPUTE ENGINES
+ # ============================================================================
+ # Each compute engine must have:
+ #   - type: 'spark' (currently the only supported type)
+ #   - config: Spark configuration options
+ #   - description: (optional) Human-readable description
+ # ============================================================================
+
+ computes:
+
+   # --------------------------------------------------------------------------
+   # LOCAL SPARK (Default - Works out of the box)
+   # --------------------------------------------------------------------------
+   # Embedded PySpark for development and small-medium datasets.
+   # Uses spark.jars with local file paths for fast startup.
+   # JDBC JARs are auto-discovered from profiles.yml connections.
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Development, testing, datasets < 10GB
+   # --------------------------------------------------------------------------
+   spark-local:
+     type: spark
+     description: "Local Spark for development and testing"
+     config:
+       master: "local[2]"                     # Use 2 CPU cores (local[*] for all)
+       spark.driver.memory: "2g"              # Driver memory
+       spark.executor.memory: "2g"            # Executor memory
+       spark.ui.enabled: "false"              # Disable Spark UI
+       spark.ui.showConsoleProgress: "false"  # No progress bars
+       # Spark 4.0 legacy compatibility flags
+       spark.sql.legacy.postgres.datetimeMapping.enabled: "true"
+       spark.sql.legacy.mysql.timestampNTZMapping.enabled: "true"
+       spark.sql.legacy.oracle.timestampMapping.enabled: "true"
+       spark.sql.legacy.mssqlserver.numericMapping.enabled: "true"
+       # Performance optimizations
+       spark.sql.shuffle.partitions: "8"
+       spark.sql.execution.arrow.pyspark.enabled: "true"
+       spark.sql.execution.arrow.pyspark.fallback.enabled: "true"
+       spark.sql.adaptive.enabled: "true"
+       spark.sql.adaptive.coalescePartitions.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # AWS EMR (Elastic MapReduce)
+   # --------------------------------------------------------------------------
+   # Connects to AWS EMR clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - AWS credentials configured (aws configure or IAM role)
+   #   - EMR cluster must be running
+   #   - Network access to EMR master node
+   #
+   # Cost: ~$1.20/hr (typical 5-node m5.xlarge cluster)
+   # Best for: AWS-native workloads, S3 data integration
+   # --------------------------------------------------------------------------
+   # emr-cluster:
+   #   type: spark
+   #   description: "AWS EMR Spark Cluster"
+   #   config:
+   #     master: "yarn"                       # Required: YARN resource manager
+   #     spark.submit.deployMode: "client"    # Client mode for interactive
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.instances: "4"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # GCP DATAPROC (Google Cloud Spark)
+   # --------------------------------------------------------------------------
+   # Connects to GCP Dataproc clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - gcloud SDK configured (gcloud auth login)
+   #   - Dataproc cluster must be running
+   #   - Network access to Dataproc master
+   #
+   # Cost: ~$1.15/hr (typical 5-node n1-standard-4 cluster)
+   # Best for: GCP-native workloads, BigQuery/GCS integration
+   # --------------------------------------------------------------------------
+   # dataproc-cluster:
+   #   type: spark
+   #   description: "GCP Dataproc Cluster"
+   #   config:
+   #     project: "my-gcp-project"            # Required: GCP project ID
+   #     region: "us-central1"                # Required: Dataproc region
+   #     cluster: "my-dataproc-cluster"       # Required: Cluster name
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # STANDALONE SPARK CLUSTER
+   # --------------------------------------------------------------------------
+   # Connects to self-managed Spark clusters (on-premises or cloud VMs).
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   # Workers download JARs from Maven Central at session start.
+   #
+   # Requirements:
+   #   - Spark master accessible at spark://host:port
+   #   - Workers must have network access to Maven Central
+   #
+   # Cost: Infrastructure-dependent (your own hardware/VMs)
+   # Best for: On-premises deployments, custom Spark configurations
+   # --------------------------------------------------------------------------
+   # spark-cluster:
+   #   type: spark
+   #   description: "Standalone Spark Cluster"
+   #   config:
+   #     master: "spark://master-node:7077"   # Required: Spark master URL
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.cores: "4"
+   #     spark.executor.instances: "10"
+
+   # --------------------------------------------------------------------------
+   # HIGH-MEMORY LOCAL SPARK
+   # --------------------------------------------------------------------------
+   # For larger local workloads (requires more system RAM).
+   # Same JAR provisioning as spark-local (local file paths).
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Larger datasets on powerful workstations
+   # --------------------------------------------------------------------------
+   # spark-local-large:
+   #   type: spark
+   #   description: "High-memory local Spark for large datasets"
+   #   config:
+   #     master: "local[*]"                   # Use all available cores
+   #     spark.driver.memory: "8g"
+   #     spark.executor.memory: "8g"
+   #     spark.sql.shuffle.partitions: "200"
+   #     spark.sql.adaptive.enabled: "true"
+   #     spark.sql.adaptive.coalescePartitions.enabled: "true"
+   #     spark.sql.adaptive.skewJoin.enabled: "true"
+   #     spark.memory.fraction: "0.8"
+   #     spark.memory.storageFraction: "0.3"
+
+ # ============================================================================
+ # CONFIGURATION REFERENCE
+ # ============================================================================
+ # Common Spark configurations:
+ #
+ # Memory:
+ #   spark.driver.memory: "4g"              # Driver memory (default 1g)
+ #   spark.executor.memory: "4g"            # Executor memory (default 1g)
+ #   spark.memory.fraction: "0.6"           # Fraction for execution/storage
+ #
+ # Parallelism:
+ #   spark.executor.cores: "4"              # Cores per executor
+ #   spark.executor.instances: "4"          # Number of executors
+ #   spark.sql.shuffle.partitions: "200"    # Shuffle partitions
+ #   spark.default.parallelism: "100"       # Default parallelism
+ #
+ # Arrow (PyArrow integration):
+ #   spark.sql.execution.arrow.pyspark.enabled: "true"
+ #   spark.sql.execution.arrow.maxRecordsPerBatch: "10000"
+ #
+ # Adaptive Query Execution (Spark 3.0+):
+ #   spark.sql.adaptive.enabled: "true"
+ #   spark.sql.adaptive.coalescePartitions.enabled: "true"
+ #   spark.sql.adaptive.skewJoin.enabled: "true"
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   Local Spark:
+ #     - Uses spark.jars with local file paths
+ #     - Fast startup (no download needed)
+ #     - JARs auto-discovered from profiles.yml
+ #
+ #   Remote Clusters (EMR, Dataproc, Standalone):
+ #     - Uses spark.jars.packages with Maven coordinates
+ #     - Workers download JARs at session start
+ #     - Supported databases: PostgreSQL, MySQL, Oracle, SQL Server,
+ #       Snowflake, Redshift, BigQuery, Teradata, DB2, and 30+ more
+ # ============================================================================
+ """
+
+
+ @dataclass
+ class ComputeCluster:
+     """Configuration for an external compute cluster."""
+
+     name: str  # Cluster identifier
+     type: str  # 'spark' (currently only Spark supported for external)
+     config: Dict[str, Any] = field(default_factory=dict)  # Cluster-specific config
+     description: Optional[str] = None
+     cost_per_hour: Optional[float] = None  # Estimated cost per hour (USD)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Serialize to dictionary."""
+         result = {
+             "type": self.type,
+             "config": self.config,
+         }
+         if self.description:
+             result["description"] = self.description
+         if self.cost_per_hour is not None:
+             result["cost_per_hour"] = self.cost_per_hour
+         return result
+
+     @classmethod
+     def from_dict(cls, name: str, data: Dict[str, Any]) -> "ComputeCluster":
+         """Deserialize from dictionary."""
+         return cls(
+             name=name,
+             type=data.get("type", "spark"),
+             config=data.get("config", {}),
+             description=data.get("description"),
+             cost_per_hour=data.get("cost_per_hour"),
+         )
+
+     def detect_platform(self) -> SparkPlatform:
+         """
+         Detect Spark platform from configuration keys.
+
+         v0.51.2: Removed Databricks support.
+         Detection order (most specific first):
+         1. Dataproc: project + region + cluster
+         2. EMR: master=yarn (without Dataproc keys)
+         3. Standalone: master=spark://
+         4. Local: master=local[*] or no master
+         5. External: fallback for unknown configurations
+
+         :returns: SparkPlatform enum value
+         """
+         if self.type != "spark":
+             return SparkPlatform.EXTERNAL
+
+         config_keys = set(self.config.keys())
+
+         # 1. Dataproc: has project, region, and cluster
+         if all(k in config_keys for k in ["project", "region", "cluster"]):
+             return SparkPlatform.DATAPROC
+
+         # Check master value for remaining platforms
+         if "master" in config_keys:
+             master = str(self.config["master"]).lower()
+
+             # 2. EMR: master=yarn (without Dataproc keys)
+             if master == "yarn":
+                 return SparkPlatform.EMR
+
+             # 3. Standalone: master=spark://
+             if master.startswith("spark://"):
+                 return SparkPlatform.STANDALONE
+
+             # 4. Local: master=local[*]
+             if master.startswith("local"):
+                 return SparkPlatform.LOCAL
+
+             # 5. External: unknown master format
+             return SparkPlatform.EXTERNAL
+
+         # Default to local (no master specified)
+         return SparkPlatform.LOCAL
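
The detection order documented in detect_platform above can be exercised directly. A minimal sketch (not part of the package; it only assumes the wheel is installed so that dbt.config.compute is importable):

    from dbt.config.compute import ComputeCluster, SparkPlatform

    # project + region + cluster is the most specific match and wins
    dataproc = ComputeCluster(
        name="dp",
        type="spark",
        config={"project": "my-gcp-project", "region": "us-central1", "cluster": "my-dataproc-cluster"},
    )
    assert dataproc.detect_platform() is SparkPlatform.DATAPROC

    # master=yarn (without the Dataproc keys) is treated as EMR
    assert ComputeCluster("emr", "spark", {"master": "yarn"}).detect_platform() is SparkPlatform.EMR

    # spark:// masters are standalone clusters; local[*] or a missing master falls back to LOCAL
    assert ComputeCluster("sa", "spark", {"master": "spark://master-node:7077"}).detect_platform() is SparkPlatform.STANDALONE
    assert ComputeCluster("dev", "spark", {}).detect_platform() is SparkPlatform.LOCAL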
+
+
+ class ComputeRegistry:
+     """
+     Registry for managing external compute clusters.
+
+     v0.55.0: Clusters stored in <project>/.dvt/computes.yml (project-level)
+     Managed exclusively via `dvt compute` CLI commands.
+     """
+
+     def __init__(self, project_dir=None):
+         """
+         Initialize compute registry.
+
+         :param project_dir: Path to project root directory (str or Path)
+         """
+         self.project_dir = str(project_dir) if project_dir else os.getcwd()
+
+         # v0.55.0: Project-level paths
+         self.project_dvt_dir = get_project_dvt_dir(self.project_dir)
+         self.compute_file = self.project_dvt_dir / "computes.yml"
+         self.jdbc_jars_dir = self.project_dvt_dir / "jdbc_jars"
+
+         self._clusters: Dict[str, ComputeCluster] = {}
+         self._target_compute: Optional[str] = None
+         self._load()
+
+     def _load(self) -> None:
+         """Load clusters from storage.
+
+         v0.55.0: Only project-level <project>/.dvt/computes.yml is supported.
+         """
+         # Load from project-level YAML file if it exists
+         if self.compute_file.exists():
+             self._load_from_yaml()
+             return
+
+         # No file exists - load defaults (will be saved when ensure_config_exists is called)
+         self._load_defaults()
+
+     def _load_from_yaml(self) -> None:
+         """Load clusters from YAML file."""
+         try:
+             with open(self.compute_file, "r") as f:
+                 data = yaml.safe_load(f)
+
+             if not data:
+                 self._load_defaults()
+                 return
+
+             # Parse target_compute (default compute engine)
+             self._target_compute = data.get("target_compute", "spark-local")
+
+             # Parse computes
+             computes_data = data.get("computes", {})
+             for name, cluster_data in computes_data.items():
+                 if cluster_data:  # Skip None/empty entries
+                     cluster = ComputeCluster.from_dict(name, cluster_data)
+                     self._clusters[cluster.name] = cluster
+
+             # If no computes defined, use defaults
+             if not self._clusters:
+                 self._load_defaults()
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to load compute registry: {str(e)}") from e
+
+     def _load_defaults(self) -> None:
+         """Load default out-of-box compute engines."""
+         data = yaml.safe_load(DEFAULT_COMPUTES_YAML)
+
+         self._target_compute = data.get("target_compute", "spark-local")
+
+         computes_data = data.get("computes", {})
+         for name, cluster_data in computes_data.items():
+             if cluster_data:  # Skip None entries (commented out samples)
+                 cluster = ComputeCluster.from_dict(name, cluster_data)
+                 self._clusters[cluster.name] = cluster
+
+     def _save(self) -> None:
+         """Save clusters to YAML file at project-level."""
+         # Ensure project .dvt directory exists
+         self.project_dvt_dir.mkdir(parents=True, exist_ok=True)
+
+         # Build the YAML content with active computes
+         computes_dict = {}
+         for cluster in self._clusters.values():
+             computes_dict[cluster.name] = cluster.to_dict()
+
+         # If file exists, try to preserve comments by updating only the active section
+         # For simplicity, we'll write the full template with active computes
+         yaml_content = f"""# ============================================================================
+ # DVT Compute Engines Configuration
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ # Edit with: dvt compute edit
+ # Validate with: dvt compute validate
+ # Test with: dvt compute test
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: {self._target_compute or 'spark-local'}
+
+ computes:
+ """
+         # Add active computes
+         for name, cluster in self._clusters.items():
+             yaml_content += f"\n  {name}:\n"
+             yaml_content += f"    type: {cluster.type}\n"
+             if cluster.description:
+                 yaml_content += f'    description: "{cluster.description}"\n'
+             yaml_content += "    config:\n"
+             for key, value in cluster.config.items():
+                 yaml_content += f'      {key}: "{value}"\n'
+
+         with open(self.compute_file, "w") as f:
+             f.write(yaml_content)
+
+     def get_config_path(self) -> Path:
+         """Get the path to the computes.yml file."""
+         return self.compute_file
+
+     def ensure_config_exists(self) -> Path:
+         """Ensure the config file exists at project-level and return its path."""
+         if not self.compute_file.exists():
+             self._load_defaults()
+             # Write full template with samples to project-level
+             self.project_dvt_dir.mkdir(parents=True, exist_ok=True)
+             with open(self.compute_file, "w") as f:
+                 f.write(DEFAULT_COMPUTES_YAML)
+         return self.compute_file
+
+     @property
+     def target_compute(self) -> str:
+         """Get the default target compute engine."""
+         return self._target_compute or "spark-local"
+
+     @target_compute.setter
+     def target_compute(self, value: str) -> None:
+         """Set the default target compute engine."""
+         if value not in self._clusters:
+             raise DbtRuntimeError(
+                 f"Cannot set target_compute to '{value}': compute engine not found. "
+                 f"Available engines: {', '.join(self._clusters.keys())}"
+             )
+         self._target_compute = value
+         self._save()
+
+     def get(self, name: str) -> Optional[ComputeCluster]:
+         """
+         Get a compute cluster by name.
+
+         :param name: Cluster name
+         :returns: ComputeCluster or None if not found
+         """
+         return self._clusters.get(name)
+
+     def list(self) -> List[ComputeCluster]:
+         """
+         List all registered clusters.
+
+         :returns: List of ComputeCluster objects
+         """
+         return list(self._clusters.values())
+
+     def exists(self, name: str) -> bool:
+         """
+         Check if a cluster exists.
+
+         :param name: Cluster name
+         :returns: True if cluster exists
+         """
+         return name in self._clusters
+
+     @staticmethod
+     def ensure_jdbc_jars_dir(project_dir: str) -> Path:
+         """
+         Ensure the project-level .dvt/jdbc_jars/ directory exists.
+
+         :param project_dir: Path to project root directory
+         :returns: Path to the jdbc_jars directory
+         """
+         jdbc_jars_dir = get_project_dvt_dir(project_dir) / "jdbc_jars"
+         jdbc_jars_dir.mkdir(parents=True, exist_ok=True)
+         return jdbc_jars_dir
+
+     def get_jdbc_jars_dir(self) -> Path:
+         """Get the project-level jdbc_jars directory path."""
+         return self.jdbc_jars_dir
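
Taken together, compute.py exposes a small project-level registry API around <project>/.dvt/computes.yml. A hedged usage sketch (the project path below is illustrative, not taken from the package's documentation):

    from dbt.config.compute import ComputeRegistry

    # Point the registry at a DVT project root; it defaults to the current directory.
    registry = ComputeRegistry(project_dir="/path/to/my_project")  # hypothetical path

    # Writes the commented DEFAULT_COMPUTES_YAML template to .dvt/computes.yml if it is missing.
    config_path = registry.ensure_config_exists()
    print(f"computes.yml: {config_path}")

    # Out of the box only spark-local is active; the EMR/Dataproc/standalone samples are commented out.
    for cluster in registry.list():
        print(cluster.name, cluster.detect_platform().value, cluster.description)

    # Changing the default engine validates the name against the registry and rewrites computes.yml.
    if registry.exists("spark-local"):
        registry.target_compute = "spark-local"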