dvt_core-0.52.2-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/config/catalogs.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ from copy import deepcopy
+ from typing import Any, Dict, List, Optional
+
+ from dbt.artifacts.resources import Catalog, CatalogWriteIntegrationConfig
+ from dbt.clients.yaml_helper import load_yaml_text
+ from dbt.config.renderer import SecretRenderer
+ from dbt.constants import CATALOGS_FILE_NAME
+ from dbt.exceptions import YamlLoadError
+ from dbt_common.clients.system import load_file_contents
+ from dbt_common.exceptions import CompilationError, DbtValidationError
+
+
+ def load_catalogs_yml(project_dir: str, project_name: str) -> Dict[str, Any]:
+     path = os.path.join(project_dir, CATALOGS_FILE_NAME)
+
+     if os.path.isfile(path):
+         try:
+             contents = load_file_contents(path, strip=False)
+             yaml_content = load_yaml_text(contents)
+
+             if not yaml_content:
+                 raise DbtValidationError(f"The file at {path} is empty")
+
+             return yaml_content
+         except DbtValidationError as e:
+             raise YamlLoadError(project_name=project_name, path=CATALOGS_FILE_NAME, exc=e)
+
+     return {}
+
+
+ def load_single_catalog(raw_catalog: Dict[str, Any], renderer: SecretRenderer) -> Catalog:
+     try:
+         rendered_catalog = renderer.render_data(raw_catalog)
+     except CompilationError as exc:
+         raise DbtValidationError(str(exc)) from exc
+
+     Catalog.validate(rendered_catalog)
+
+     write_integrations = []
+     write_integration_names = set()
+
+     for raw_integration in rendered_catalog.get("write_integrations", []):
+         if raw_integration["name"] in write_integration_names:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' cannot have multiple 'write_integrations' with the same name: '{raw_integration['name']}'."
+             )
+
+         # We're going to let the adapter validate the integration config
+         write_integrations.append(
+             CatalogWriteIntegrationConfig(**raw_integration, catalog_name=raw_catalog["name"])
+         )
+         write_integration_names.add(raw_integration["name"])
+
+     # Validate + set default active_write_integration if unset
+     active_write_integration = rendered_catalog.get("active_write_integration")
+     valid_write_integration_names = [integration.name for integration in write_integrations]
+
+     if not active_write_integration:
+         if len(valid_write_integration_names) == 1:
+             active_write_integration = write_integrations[0].name
+         else:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' when multiple 'write_integrations' are provided."
+             )
+     else:
+         if active_write_integration not in valid_write_integration_names:
+             raise DbtValidationError(
+                 f"Catalog '{rendered_catalog['name']}' must specify an 'active_write_integration' from its set of defined 'write_integrations': {valid_write_integration_names}. Got: '{active_write_integration}'."
+             )
+
+     return Catalog(
+         name=raw_catalog["name"],
+         active_write_integration=active_write_integration,
+         write_integrations=write_integrations,
+     )
+
+
+ def load_catalogs(project_dir: str, project_name: str, cli_vars: Dict[str, Any]) -> List[Catalog]:
+     raw_catalogs = load_catalogs_yml(project_dir, project_name).get("catalogs", [])
+     catalogs_renderer = SecretRenderer(cli_vars)
+
+     return [load_single_catalog(raw_catalog, catalogs_renderer) for raw_catalog in raw_catalogs]
+
+
+ def get_active_write_integration(catalog: Catalog) -> Optional[CatalogWriteIntegrationConfig]:
+     for integration in catalog.write_integrations:
+         if integration.name == catalog.active_write_integration:
+             active_integration = deepcopy(integration)
+             active_integration.catalog_name = active_integration.name
+             active_integration.name = catalog.name
+             return active_integration
+
+     return None
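
For orientation, the snippet below is a minimal usage sketch of the loaders in this file. It is not part of the package: the project name and the catalogs.yml shape are hypothetical, and any write-integration fields beyond name are left to whatever the adapter expects.

# Hypothetical usage of dbt/config/catalogs.py, assuming a catalogs.yml at the
# project root along the lines of:
#
#   catalogs:
#     - name: analytics
#       write_integrations:
#         - name: prod_write        # remaining integration keys are adapter-specific
#
from dbt.config.catalogs import get_active_write_integration, load_catalogs

catalogs = load_catalogs(project_dir=".", project_name="my_project", cli_vars={})
for catalog in catalogs:
    # With a single write integration and no explicit active_write_integration,
    # load_single_catalog defaults to that integration.
    integration = get_active_write_integration(catalog)
    if integration is not None:
        # After the name swap in get_active_write_integration, .name holds the
        # catalog name and .catalog_name holds the original integration name.
        print(integration.name, integration.catalog_name)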
dbt/config/compute.cpython-310-darwin.so ADDED
Binary file
dbt/config/compute.py ADDED
@@ -0,0 +1,547 @@
+ """
+ Compute Cluster Registry
+
+ Manages external compute cluster configurations for DVT.
+
+ v0.5.97: Computes stored in ~/.dvt/.data/computes.yml (YAML format)
+ Managed exclusively via `dvt compute` CLI commands.
+ Contains comprehensive commented samples for all platforms.
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import yaml
+ from dbt_common.exceptions import DbtRuntimeError
+
+
+ def get_dvt_dir() -> Path:
+     """Get the DVT config directory (~/.dvt/)."""
+     # Check DVT_PROFILES_DIR env var first (same as profiles)
+     profiles_dir = os.environ.get("DVT_PROFILES_DIR")
+     if profiles_dir:
+         return Path(profiles_dir)
+     # Fall back to ~/.dvt/
+     return Path.home() / ".dvt"
+
+
+ def get_internal_data_dir() -> Path:
+     """Get the DVT internal data directory (~/.dvt/.data/)."""
+     return get_dvt_dir() / ".data"
+
+
+ class SparkPlatform(Enum):
+     """Spark platform types for connection strategies.
+
+     v0.51.2: Removed DATABRICKS (serverless cannot read external JDBC sources).
+     """
+
+     LOCAL = "local"
+     EMR = "emr"
+     DATAPROC = "dataproc"
+     STANDALONE = "standalone"  # Self-managed Spark clusters (spark://)
+     EXTERNAL = "external"  # Generic external cluster (fallback)
+
+
+ # Default computes.yml template with comprehensive commented samples
+ DEFAULT_COMPUTES_YAML = """# ============================================================================
+ # DVT Compute Engines Configuration (v0.5.98)
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ #
+ # Commands:
+ #   dvt compute test        Test connectivity to all compute engines
+ #   dvt compute edit        Open this file in your editor
+ #   dvt compute validate    Validate YAML syntax
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   - Local Spark: Uses spark.jars with local file paths (fast startup)
+ #   - Remote clusters: Uses spark.jars.packages with Maven coordinates
+ #     (workers download JARs from Maven Central at session start)
+ #
+ # Platform Detection:
+ #   DVT auto-detects the platform from config keys:
+ #   - Dataproc: project + region + cluster
+ #   - EMR: master=yarn (without Dataproc keys)
+ #   - Standalone: master=spark://...
+ #   - Local: master=local[*] or no master
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: spark-local
+
+ # ============================================================================
+ # COMPUTE ENGINES
+ # ============================================================================
+ # Each compute engine must have:
+ #   - type: 'spark' (currently the only supported type)
+ #   - config: Spark configuration options
+ #   - description: (optional) Human-readable description
+ # ============================================================================
+
+ computes:
+
+   # --------------------------------------------------------------------------
+   # LOCAL SPARK (Default - Works out of the box)
+   # --------------------------------------------------------------------------
+   # Embedded PySpark for development and small-medium datasets.
+   # Uses spark.jars with local file paths for fast startup.
+   # JDBC JARs are auto-discovered from profiles.yml connections.
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Development, testing, datasets < 10GB
+   # --------------------------------------------------------------------------
+   spark-local:
+     type: spark
+     description: "Local Spark for development and testing"
+     config:
+       master: "local[2]"                     # Use 2 CPU cores (local[*] for all)
+       spark.driver.memory: "2g"              # Driver memory
+       spark.executor.memory: "2g"            # Executor memory
+       spark.ui.enabled: "false"              # Disable Spark UI
+       spark.ui.showConsoleProgress: "false"  # No progress bars
+       # Spark 4.0 legacy compatibility flags
+       spark.sql.legacy.postgres.datetimeMapping.enabled: "true"
+       spark.sql.legacy.mysql.timestampNTZMapping.enabled: "true"
+       spark.sql.legacy.oracle.timestampMapping.enabled: "true"
+       spark.sql.legacy.mssqlserver.numericMapping.enabled: "true"
+       # Performance optimizations
+       spark.sql.shuffle.partitions: "8"
+       spark.sql.execution.arrow.pyspark.enabled: "true"
+       spark.sql.execution.arrow.pyspark.fallback.enabled: "true"
+       spark.sql.adaptive.enabled: "true"
+       spark.sql.adaptive.coalescePartitions.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # AWS EMR (Elastic MapReduce)
+   # --------------------------------------------------------------------------
+   # Connects to AWS EMR clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - AWS credentials configured (aws configure or IAM role)
+   #   - EMR cluster must be running
+   #   - Network access to EMR master node
+   #
+   # Cost: ~$1.20/hr (typical 5-node m5.xlarge cluster)
+   # Best for: AWS-native workloads, S3 data integration
+   # --------------------------------------------------------------------------
+   # emr-cluster:
+   #   type: spark
+   #   description: "AWS EMR Spark Cluster"
+   #   config:
+   #     master: "yarn"                       # Required: YARN resource manager
+   #     spark.submit.deployMode: "client"    # Client mode for interactive
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.instances: "4"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # GCP DATAPROC (Google Cloud Spark)
+   # --------------------------------------------------------------------------
+   # Connects to GCP Dataproc clusters via YARN.
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   #
+   # Requirements:
+   #   - gcloud SDK configured (gcloud auth login)
+   #   - Dataproc cluster must be running
+   #   - Network access to Dataproc master
+   #
+   # Cost: ~$1.15/hr (typical 5-node n1-standard-4 cluster)
+   # Best for: GCP-native workloads, BigQuery/GCS integration
+   # --------------------------------------------------------------------------
+   # dataproc-cluster:
+   #   type: spark
+   #   description: "GCP Dataproc Cluster"
+   #   config:
+   #     project: "my-gcp-project"            # Required: GCP project ID
+   #     region: "us-central1"                # Required: Dataproc region
+   #     cluster: "my-dataproc-cluster"       # Required: Cluster name
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.dynamicAllocation.enabled: "true"
+
+   # --------------------------------------------------------------------------
+   # STANDALONE SPARK CLUSTER
+   # --------------------------------------------------------------------------
+   # Connects to self-managed Spark clusters (on-premises or cloud VMs).
+   # JDBC drivers are provisioned via spark.jars.packages (Maven).
+   # Workers download JARs from Maven Central at session start.
+   #
+   # Requirements:
+   #   - Spark master accessible at spark://host:port
+   #   - Workers must have network access to Maven Central
+   #
+   # Cost: Infrastructure-dependent (your own hardware/VMs)
+   # Best for: On-premises deployments, custom Spark configurations
+   # --------------------------------------------------------------------------
+   # spark-cluster:
+   #   type: spark
+   #   description: "Standalone Spark Cluster"
+   #   config:
+   #     master: "spark://master-node:7077"   # Required: Spark master URL
+   #     spark.driver.memory: "4g"
+   #     spark.executor.memory: "8g"
+   #     spark.executor.cores: "4"
+   #     spark.executor.instances: "10"
+
+   # --------------------------------------------------------------------------
+   # HIGH-MEMORY LOCAL SPARK
+   # --------------------------------------------------------------------------
+   # For larger local workloads (requires more system RAM).
+   # Same JAR provisioning as spark-local (local file paths).
+   #
+   # Cost: Free (runs on your local machine)
+   # Best for: Larger datasets on powerful workstations
+   # --------------------------------------------------------------------------
+   # spark-local-large:
+   #   type: spark
+   #   description: "High-memory local Spark for large datasets"
+   #   config:
+   #     master: "local[*]"                   # Use all available cores
+   #     spark.driver.memory: "8g"
+   #     spark.executor.memory: "8g"
+   #     spark.sql.shuffle.partitions: "200"
+   #     spark.sql.adaptive.enabled: "true"
+   #     spark.sql.adaptive.coalescePartitions.enabled: "true"
+   #     spark.sql.adaptive.skewJoin.enabled: "true"
+   #     spark.memory.fraction: "0.8"
+   #     spark.memory.storageFraction: "0.3"
+
+ # ============================================================================
+ # CONFIGURATION REFERENCE
+ # ============================================================================
+ # Common Spark configurations:
+ #
+ # Memory:
+ #   spark.driver.memory: "4g"                # Driver memory (default 1g)
+ #   spark.executor.memory: "4g"              # Executor memory (default 1g)
+ #   spark.memory.fraction: "0.6"             # Fraction for execution/storage
+ #
+ # Parallelism:
+ #   spark.executor.cores: "4"                # Cores per executor
+ #   spark.executor.instances: "4"            # Number of executors
+ #   spark.sql.shuffle.partitions: "200"      # Shuffle partitions
+ #   spark.default.parallelism: "100"         # Default parallelism
+ #
+ # Arrow (PyArrow integration):
+ #   spark.sql.execution.arrow.pyspark.enabled: "true"
+ #   spark.sql.execution.arrow.maxRecordsPerBatch: "10000"
+ #
+ # Adaptive Query Execution (Spark 3.0+):
+ #   spark.sql.adaptive.enabled: "true"
+ #   spark.sql.adaptive.coalescePartitions.enabled: "true"
+ #   spark.sql.adaptive.skewJoin.enabled: "true"
+ #
+ # JDBC JAR Provisioning (v0.5.98):
+ #   Local Spark:
+ #     - Uses spark.jars with local file paths
+ #     - Fast startup (no download needed)
+ #     - JARs auto-discovered from profiles.yml
+ #
+ #   Remote Clusters (EMR, Dataproc, Standalone):
+ #     - Uses spark.jars.packages with Maven coordinates
+ #     - Workers download JARs at session start
+ #     - Supported databases: PostgreSQL, MySQL, Oracle, SQL Server,
+ #       Snowflake, Redshift, BigQuery, Teradata, DB2, and 30+ more
+ # ============================================================================
+ """
+
+
+ @dataclass
+ class ComputeCluster:
+     """Configuration for an external compute cluster."""
+
+     name: str  # Cluster identifier
+     type: str  # 'spark' (currently only Spark supported for external)
+     config: Dict[str, Any] = field(default_factory=dict)  # Cluster-specific config
+     description: Optional[str] = None
+     cost_per_hour: Optional[float] = None  # Estimated cost per hour (USD)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Serialize to dictionary."""
+         result = {
+             "type": self.type,
+             "config": self.config,
+         }
+         if self.description:
+             result["description"] = self.description
+         if self.cost_per_hour is not None:
+             result["cost_per_hour"] = self.cost_per_hour
+         return result
+
+     @classmethod
+     def from_dict(cls, name: str, data: Dict[str, Any]) -> "ComputeCluster":
+         """Deserialize from dictionary."""
+         return cls(
+             name=name,
+             type=data.get("type", "spark"),
+             config=data.get("config", {}),
+             description=data.get("description"),
+             cost_per_hour=data.get("cost_per_hour"),
+         )
+
+     def detect_platform(self) -> SparkPlatform:
+         """
+         Detect Spark platform from configuration keys.
+
+         v0.51.2: Removed Databricks support.
+         Detection order (most specific first):
+         1. Dataproc: project + region + cluster
+         2. EMR: master=yarn (without Dataproc keys)
+         3. Standalone: master=spark://
+         4. Local: master=local[*] or no master
+         5. External: fallback for unknown configurations
+
+         :returns: SparkPlatform enum value
+         """
+         if self.type != "spark":
+             return SparkPlatform.EXTERNAL
+
+         config_keys = set(self.config.keys())
+
+         # 1. Dataproc: has project, region, and cluster
+         if all(k in config_keys for k in ["project", "region", "cluster"]):
+             return SparkPlatform.DATAPROC
+
+         # Check master value for remaining platforms
+         if "master" in config_keys:
+             master = str(self.config["master"]).lower()
+
+             # 2. EMR: master=yarn (without Dataproc keys)
+             if master == "yarn":
+                 return SparkPlatform.EMR
+
+             # 3. Standalone: master=spark://
+             if master.startswith("spark://"):
+                 return SparkPlatform.STANDALONE
+
+             # 4. Local: master=local[*]
+             if master.startswith("local"):
+                 return SparkPlatform.LOCAL
+
+             # 5. External: unknown master format
+             return SparkPlatform.EXTERNAL
+
+         # Default to local (no master specified)
+         return SparkPlatform.LOCAL
+
+
+ class ComputeRegistry:
+     """
+     Registry for managing external compute clusters.
+
+     v0.5.97: Clusters stored in ~/.dvt/.data/computes.yml (YAML format)
+     Managed exclusively via `dvt compute` CLI commands.
+     """
+
+     def __init__(self, project_dir: Optional[str] = None):
+         """
+         Initialize compute registry.
+
+         :param project_dir: Path to project root directory (for JDBC jars)
+         """
+         self.project_dir = project_dir or os.getcwd()
+         self.data_dir = get_internal_data_dir()
+         self.compute_file = self.data_dir / "computes.yml"
+         # Also check for old JSON file for migration
+         self.old_compute_file = self.data_dir / "computes.json"
+         # JDBC jars stay at project level
+         self.jdbc_jars_dir = os.path.join(self.project_dir, ".dvt", "jdbc_jars")
+         self._clusters: Dict[str, ComputeCluster] = {}
+         self._target_compute: Optional[str] = None
+         self._load()
+
+     def _load(self) -> None:
+         """Load clusters from internal storage."""
+         # First check for YAML file (new format)
+         if self.compute_file.exists():
+             self._load_from_yaml()
+             return
+
+         # Check for old JSON file and migrate
+         if self.old_compute_file.exists():
+             self._migrate_from_json()
+             return
+
+         # No files exist - create defaults
+         self._load_defaults()
+         self._save()
+
+     def _load_from_yaml(self) -> None:
+         """Load clusters from YAML file."""
+         try:
+             with open(self.compute_file, "r") as f:
+                 data = yaml.safe_load(f)
+
+             if not data:
+                 self._load_defaults()
+                 return
+
+             # Parse target_compute (default compute engine)
+             self._target_compute = data.get("target_compute", "spark-local")
+
+             # Parse computes
+             computes_data = data.get("computes", {})
+             for name, cluster_data in computes_data.items():
+                 if cluster_data:  # Skip None/empty entries
+                     cluster = ComputeCluster.from_dict(name, cluster_data)
+                     self._clusters[cluster.name] = cluster
+
+             # If no computes defined, use defaults
+             if not self._clusters:
+                 self._load_defaults()
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to load compute registry: {str(e)}") from e
+
+     def _migrate_from_json(self) -> None:
+         """Migrate from old JSON format to YAML."""
+         import json
+
+         try:
+             with open(self.old_compute_file, "r") as f:
+                 data = json.load(f)
+
+             if data:
+                 self._target_compute = data.get("target_compute", "spark-local")
+                 computes_data = data.get("computes", {})
+                 for name, cluster_data in computes_data.items():
+                     if cluster_data:
+                         cluster_data["name"] = name
+                         cluster = ComputeCluster.from_dict(name, cluster_data)
+                         self._clusters[cluster.name] = cluster
+
+             if not self._clusters:
+                 self._load_defaults()
+
+             # Save in new YAML format
+             self._save()
+
+             # Remove old JSON file
+             self.old_compute_file.unlink()
+
+         except Exception:
+             self._load_defaults()
+             self._save()
+
+     def _load_defaults(self) -> None:
+         """Load default out-of-box compute engines."""
+         data = yaml.safe_load(DEFAULT_COMPUTES_YAML)
+
+         self._target_compute = data.get("target_compute", "spark-local")
+
+         computes_data = data.get("computes", {})
+         for name, cluster_data in computes_data.items():
+             if cluster_data:  # Skip None entries (commented out samples)
+                 cluster = ComputeCluster.from_dict(name, cluster_data)
+                 self._clusters[cluster.name] = cluster
+
+     def _save(self) -> None:
+         """Save clusters to YAML file while preserving comments."""
+         # Ensure data directory exists
+         os.makedirs(self.data_dir, exist_ok=True)
+
+         # Build the YAML content with active computes
+         computes_dict = {}
+         for cluster in self._clusters.values():
+             computes_dict[cluster.name] = cluster.to_dict()
+
+         # If file exists, try to preserve comments by updating only the active section
+         # For simplicity, we'll write the full template with active computes
+         yaml_content = f"""# ============================================================================
+ # DVT Compute Engines Configuration
+ # ============================================================================
+ # This file defines Spark compute engines for federated query execution.
+ # Edit with: dvt compute edit
+ # Validate with: dvt compute validate
+ # Test with: dvt compute test
+ # ============================================================================
+
+ # Default compute engine (must match a name in 'computes' section)
+ target_compute: {self._target_compute or 'spark-local'}
+
+ computes:
+ """
+         # Add active computes
+         for name, cluster in self._clusters.items():
+             yaml_content += f"\n  {name}:\n"
+             yaml_content += f"    type: {cluster.type}\n"
+             if cluster.description:
+                 yaml_content += f'    description: "{cluster.description}"\n'
+             yaml_content += "    config:\n"
+             for key, value in cluster.config.items():
+                 yaml_content += f'      {key}: "{value}"\n'
+
+         with open(self.compute_file, "w") as f:
+             f.write(yaml_content)
+
+     def get_config_path(self) -> Path:
+         """Get the path to the computes.yml file."""
+         return self.compute_file
+
+     def ensure_config_exists(self) -> Path:
+         """Ensure the config file exists and return its path."""
+         if not self.compute_file.exists():
+             self._load_defaults()
+             # Write full template with samples
+             os.makedirs(self.data_dir, exist_ok=True)
+             with open(self.compute_file, "w") as f:
+                 f.write(DEFAULT_COMPUTES_YAML)
+         return self.compute_file
+
+     @property
+     def target_compute(self) -> str:
+         """Get the default target compute engine."""
+         return self._target_compute or "spark-local"
+
+     @target_compute.setter
+     def target_compute(self, value: str) -> None:
+         """Set the default target compute engine."""
+         if value not in self._clusters:
+             raise DbtRuntimeError(
+                 f"Cannot set target_compute to '{value}': compute engine not found. "
+                 f"Available engines: {', '.join(self._clusters.keys())}"
+             )
+         self._target_compute = value
+         self._save()
+
+     def get(self, name: str) -> Optional[ComputeCluster]:
+         """
+         Get a compute cluster by name.
+
+         :param name: Cluster name
+         :returns: ComputeCluster or None if not found
+         """
+         return self._clusters.get(name)
+
+     def list(self) -> List[ComputeCluster]:
+         """
+         List all registered clusters.
+
+         :returns: List of ComputeCluster objects
+         """
+         return list(self._clusters.values())
+
+     def exists(self, name: str) -> bool:
+         """
+         Check if a cluster exists.
+
+         :param name: Cluster name
+         :returns: True if cluster exists
+         """
+         return name in self._clusters
+
+     @staticmethod
+     def ensure_jdbc_jars_dir(project_dir: str) -> None:
+         """
+         Ensure the project-level .dvt/jdbc_jars/ directory exists.
+
+         :param project_dir: Path to project root directory
+         """
+         jdbc_jars_dir = os.path.join(project_dir, ".dvt", "jdbc_jars")
+         os.makedirs(jdbc_jars_dir, exist_ok=True)
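
As a quick illustration of the registry API and the platform-detection order documented above, here is a hedged sketch that is not part of the package: the engine names and config values are invented, and constructing ComputeRegistry reads (or creates) ~/.dvt/.data/computes.yml as described in this module.

from dbt.config.compute import ComputeCluster, ComputeRegistry, SparkPlatform

# Detection order: Dataproc keys first, then master=yarn (EMR),
# spark:// (standalone), local[...] (local), otherwise external.
dataproc = ComputeCluster(
    name="dp-example",
    type="spark",
    config={"project": "my-gcp-project", "region": "us-central1", "cluster": "dp-1"},
)
assert dataproc.detect_platform() is SparkPlatform.DATAPROC

onprem = ComputeCluster(name="onprem", type="spark", config={"master": "spark://master:7077"})
assert onprem.detect_platform() is SparkPlatform.STANDALONE

# The registry loads ~/.dvt/.data/computes.yml, creating it with the
# spark-local default if it does not exist yet.
registry = ComputeRegistry()
print("target compute:", registry.target_compute)
for cluster in registry.list():
    print(cluster.name, "->", cluster.detect_platform().value)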