dvt_core-0.59.0a51-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/task/dvt_seed.py ADDED
@@ -0,0 +1,806 @@
+"""
+DVT Seed Task - Spark-powered seed loading with pattern-based transformations.
+
+DVT v0.59.0a34: Hybrid approach - file databases use native, network use JDBC.
+Uses DVT's virtualization infrastructure for consistent behavior across all targets.
+
+Features:
+1. Read CSV files with Spark
+2. Match column values against patterns in value_transformations table
+3. Apply Spark SQL transformations automatically
+4. Write to target using best method:
+   - File-based databases (DuckDB): Native connection via Pandas/Arrow (avoids locking issues)
+   - Network databases (Postgres, etc.): Spark JDBC with DROP CASCADE support
+5. Rich UI output with progress tracking
+
+This ensures consistent behavior whether writing to DuckDB, Postgres, Databricks,
+or any other supported adapter.
+"""
+
+import re
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import duckdb
+
+from dbt.adapters.factory import get_adapter
+from dbt.artifacts.schemas.results import NodeStatus, RunStatus
+from dbt.artifacts.schemas.run import RunResult
+from dbt.config.runtime import RuntimeConfig
+from dbt.contracts.graph.nodes import SeedNode
+from dbt.events.types import LogSeedResult, LogStartLine
+from dbt.graph import ResourceTypeSelector
+from dbt.node_types import NodeType
+from dbt.task import group_lookup
+from dbt.task.base import BaseRunner
+from dbt.task.dvt_output import DVTMultiBarDisplay, HAS_RICH
+from dbt.task.printer import print_run_end_messages
+from dbt.task.run import RunTask
+from dbt_common.events.base_types import EventLevel
+from dbt_common.events.functions import fire_event
+from dbt_common.exceptions import DbtInternalError
+
+
+class ValueTransformationRegistry:
+    """Registry for value transformation patterns from MDM."""
+
+    _patterns: Optional[List[Tuple[str, str, str, int]]] = None
+
+    @classmethod
+    def get_patterns(cls) -> List[Tuple[str, str, str, int]]:
+        """Load transformation patterns from MDM database."""
+        if cls._patterns is not None:
+            return cls._patterns
+
+        # Try packaged registry first, then user MDM
+        registry_paths = [
+            Path(__file__).parent.parent / "include" / "data" / "adapters_registry.duckdb",
+            Path.home() / ".dvt" / ".data" / "mdm.duckdb",
+        ]
+
+        cls._patterns = []
+        for path in registry_paths:
+            if path.exists():
+                try:
+                    conn = duckdb.connect(str(path), read_only=True)
+                    # Check if table exists
+                    tables = conn.execute(
+                        "SELECT table_name FROM information_schema.tables WHERE table_name = 'value_transformations'"
+                    ).fetchall()
+                    if tables:
+                        result = conn.execute("""
+                            SELECT pattern, target_type, transform_expr, priority
+                            FROM value_transformations
+                            ORDER BY priority DESC
+                        """).fetchall()
+                        cls._patterns = [(r[0], r[1], r[2], r[3]) for r in result]
+                    conn.close()
+                    if cls._patterns:
+                        break
+                except Exception:
+                    pass
+
+        return cls._patterns
+
+    @classmethod
+    def match_pattern(cls, sample_values: List[str]) -> Optional[Tuple[str, str]]:
+        """
+        Match sample values against transformation patterns.
+
+        Returns (target_type, transform_expr) if a pattern matches majority of values.
+        """
+        patterns = cls.get_patterns()
+        if not patterns or not sample_values:
+            return None
+
+        # Filter out None/empty values
+        valid_values = [v for v in sample_values if v is not None and str(v).strip()]
+        if not valid_values:
+            return None
+
+        for pattern, target_type, transform_expr, _ in patterns:
+            try:
+                regex = re.compile(pattern, re.IGNORECASE)
+                matches = sum(1 for v in valid_values if regex.match(str(v).strip()))
+                # If 80%+ of values match, use this pattern
+                if matches / len(valid_values) >= 0.8:
+                    return (target_type, transform_expr)
+            except re.error:
+                continue
+
+        return None
+
+
+class DVTSeedRunner(BaseRunner):
+    """DVT Seed Runner using Spark for ETL-grade seed loading.
+
+    Uses unified Spark JDBC for ALL adapters (32+) - same infrastructure
+    as dvt run federation path. No adapter-specific code paths.
+    """
+
+    def __init__(self, config: RuntimeConfig, adapter, node: SeedNode, node_index: int, num_nodes: int):
+        super().__init__(config, adapter, node, node_index, num_nodes)
+        self._spark = None
+        self._spark_engine = None
+
+    def describe_node(self) -> str:
+        return f"seed file {self.get_node_representation()}"
+
+    def before_execute(self) -> None:
+        fire_event(
+            LogStartLine(
+                description=self.describe_node(),
+                index=self.node_index,
+                total=self.num_nodes,
+                node_info=self.node.node_info,
+            )
+        )
+
+    def after_execute(self, result) -> None:
+        """Print result line after seed execution completes."""
+        self.print_result_line(result)
+
+    def get_node_representation(self) -> str:
+        return f"{self.node.schema}.{self.node.alias}"
+
+    def _get_spark_session(self):
+        """Get or create Spark session using configured compute from computes.yml.
+
+        Compute selection hierarchy (highest to lowest priority):
+        1. CLI --target-compute flag (MUST exist if specified)
+        2. computes.yml target_compute default (MUST exist if specified)
+        3. Fallback to local Spark ONLY if no compute is configured
+
+        DVT Rule: Invalid compute → Compilation Error (NO fallback)
+        """
+        if self._spark is not None:
+            return self._spark
+
+        from dbt.compute.engines.spark_engine import SparkEngine
+        from dbt.compute.jdbc_utils import set_docker_mode
+        from dbt.config.compute import ComputeRegistry
+
+        # Load compute configuration from project's computes.yml
+        project_dir = self.config.project_root
+        registry = ComputeRegistry(project_dir=project_dir)
+
+        # Check for CLI --target-compute override (highest priority)
+        cli_target_compute = getattr(self.config.args, 'TARGET_COMPUTE', None)
+
+        # Determine which compute to use
+        target_compute = cli_target_compute or registry.target_compute
+
+        if target_compute:
+            # A compute is specified - it MUST exist (no fallback)
+            compute_cluster = registry.get(target_compute)
+            if not compute_cluster or not compute_cluster.config:
+                available = [c.name for c in registry.list()]
+                raise DbtInternalError(
+                    f"Compute '{target_compute}' not found in computes.yml. "
+                    f"Available computes: {available}"
+                )
+
+            # DVT v0.59.0a40: Enable Docker mode for standalone clusters with localhost master
+            # This rewrites localhost -> host.docker.internal in JDBC URLs
+            cluster_config = compute_cluster.config
+            master = cluster_config.get("master", "")
+            if master.startswith("spark://") and ("localhost" in master or "127.0.0.1" in master):
+                set_docker_mode(True)
+            else:
+                set_docker_mode(False)
+
+            # Use configured Spark settings (SparkEngine auto-detects platform from config)
+            self._spark_engine = SparkEngine(
+                spark_config=cluster_config,
+                app_name="DVT-Seed",
+            )
+        else:
+            # No compute specified anywhere - fallback to local Spark
+            set_docker_mode(False)
+            self._spark_engine = SparkEngine(
+                mode="embedded",
+                app_name="DVT-Seed",
+            )
+
+        # Connect to Spark (creates the session)
+        self._spark_engine.connect()
+        self._spark = self._spark_engine.spark
+        return self._spark
+
+    def _get_seed_path(self) -> Path:
+        """Get the path to the seed CSV file."""
+        # Seeds are in the project's seed-paths directory
+        seed_paths = self.config.seed_paths
+        for seed_path in seed_paths:
+            full_path = Path(self.config.project_root) / seed_path / f"{self.node.name}.csv"
+            if full_path.exists():
+                return full_path
+
+        # Try original_file_path
+        if hasattr(self.node, 'original_file_path') and self.node.original_file_path:
+            original = Path(self.config.project_root) / self.node.original_file_path
+            if original.exists():
+                return original
+
+        raise FileNotFoundError(f"Seed file not found for {self.node.name}")
+
+    def _detect_transformations(self, spark_df) -> Dict[str, Tuple[str, str]]:
+        """
+        Analyze DataFrame columns and detect needed transformations.
+
+        Returns dict of column_name -> (target_type, transform_expr)
+        """
+        transformations = {}
+
+        # Sample first 100 rows for pattern matching
+        try:
+            sample_rows = spark_df.limit(100).collect()
+        except Exception:
+            return transformations
+
+        if not sample_rows:
+            return transformations
+
+        # Check each string column
+        for col_name in spark_df.columns:
+            # Get sample values for this column
+            sample_values = []
+            for row in sample_rows:
+                try:
+                    val = row[col_name]
+                    if val is not None:
+                        sample_values.append(str(val))
+                except Exception:
+                    pass
+
+            # Try to match a transformation pattern
+            match = ValueTransformationRegistry.match_pattern(sample_values)
+            if match:
+                transformations[col_name] = match
+
+        return transformations
+
+    def _apply_transformations(self, spark_df, transformations: Dict[str, Tuple[str, str]]):
+        """Apply Spark SQL transformations to columns."""
+        from pyspark.sql import functions as F
+
+        if not transformations:
+            return spark_df
+
+        # Build select expressions
+        select_exprs = []
+        for col_name in spark_df.columns:
+            if col_name in transformations:
+                target_type, transform_expr = transformations[col_name]
+                # Replace {col} placeholder with actual column reference
+                expr = transform_expr.replace("{col}", f"`{col_name}`")
+                select_exprs.append(F.expr(expr).alias(col_name))
+            else:
+                select_exprs.append(F.col(f"`{col_name}`"))
+
+        return spark_df.select(*select_exprs)
+
+    # File-based database types that need native connection (not JDBC)
+    FILE_BASED_ADAPTERS = {'duckdb', 'sqlite'}
+
+    def _write_to_file_database(self, spark_df, adapter) -> int:
+        """Write DataFrame to file-based database using native connection.
+
+        File-based databases (DuckDB, SQLite) don't handle JDBC writes well
+        due to file locking issues. We use native connections via Pandas/Arrow.
+
+        Args:
+            spark_df: Spark DataFrame to write
+            adapter: The dbt adapter (used for credentials and relation naming)
+
+        Returns:
+            Row count written
+        """
+        adapter_type = adapter.type()
+        credentials = adapter.config.credentials
+
+        # Convert Spark DataFrame to Pandas
+        pdf = spark_df.toPandas()
+        row_count = len(pdf)
+
+        if adapter_type == 'duckdb':
+            # Get DuckDB path from credentials
+            db_path = getattr(credentials, 'path', None) or getattr(credentials, 'database', None)
+            if not db_path:
+                raise ValueError("DuckDB path not found in credentials")
+
+            # Expand user path
+            db_path = str(Path(db_path).expanduser())
+
+            # Get schema and table name
+            schema = self.node.schema or 'main'
+            table_name = self.node.alias or self.node.name
+
+            # Connect and write using DuckDB's native connection
+            conn = duckdb.connect(db_path)
+            try:
+                # Create schema if needed
+                conn.execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")
+                # Write DataFrame (replace if exists)
+                full_table = f"{schema}.{table_name}"
+                conn.execute(f"DROP TABLE IF EXISTS {full_table}")
+                # Register the Pandas DataFrame as a virtual table, then create from it
+                conn.register('_dvt_seed_data', pdf)
+                conn.execute(f"CREATE TABLE {full_table} AS SELECT * FROM _dvt_seed_data")
+                conn.unregister('_dvt_seed_data')
+                conn.commit()
+            finally:
+                conn.close()
+
+        elif adapter_type == 'sqlite':
+            import sqlite3
+
+            db_path = getattr(credentials, 'database', None)
+            if not db_path:
+                raise ValueError("SQLite database path not found in credentials")
+
+            db_path = str(Path(db_path).expanduser())
+            table_name = self.node.alias or self.node.name
+
+            conn = sqlite3.connect(db_path)
+            try:
+                pdf.to_sql(table_name, conn, if_exists='replace', index=False)
+            finally:
+                conn.close()
+
+        return row_count
+
+    def _drop_table_cascade(self, adapter, relation) -> None:
+        """Drop a table/view with CASCADE to handle dependent objects.
+
+        Network databases like PostgreSQL may have views depending on tables.
+        We need to drop with CASCADE before overwriting.
+
+        Args:
+            adapter: The dbt adapter
+            relation: The relation to drop
+        """
+        adapter_type = adapter.type()
+        target_table = relation.render()
+
+        # Get a raw connection from the adapter
+        with adapter.connection_named('drop_cascade'):
+            conn = adapter.connections.get_thread_connection()
+            if conn and conn.handle:
+                try:
+                    # Use raw connection to execute DROP CASCADE
+                    cursor = conn.handle.cursor()
+                    # Try both TABLE and VIEW
+                    for obj_type in ['TABLE', 'VIEW']:
+                        try:
+                            drop_sql = f"DROP {obj_type} IF EXISTS {target_table} CASCADE"
+                            cursor.execute(drop_sql)
+                        except Exception:
+                            pass
+                    conn.handle.commit()
+                except Exception:
+                    # Ignore errors - table may not exist
+                    pass
+
+    def _write_to_target(self, spark_df, adapter) -> int:
+        """Write DataFrame to target database using the appropriate method.
+
+        DVT v0.59.0a47: Hybrid approach:
+        - File-based databases (DuckDB): Native connection via Pandas/Arrow
+        - Network databases (Postgres, etc.): Spark JDBC with DROP CASCADE
+
+        Args:
+            spark_df: Spark DataFrame to write
+            adapter: The dbt adapter (used for credentials and relation naming)
+
+        Returns:
+            Row count written
+        """
+        adapter_type = adapter.type()
+
+        # Check if this is a file-based database
+        if adapter_type in self.FILE_BASED_ADAPTERS:
+            return self._write_to_file_database(spark_df, adapter)
+
+        # Network database - use Spark JDBC
+        from dbt.compute.jdbc_utils import build_jdbc_config
+
+        credentials = adapter.config.credentials
+
+        # Use adapter's Relation class for DROP CASCADE (needs proper quoting)
+        relation = adapter.Relation.create_from(self.config, self.node)
+
+        # Drop with CASCADE before write (handles dependent views)
+        self._drop_table_cascade(adapter, relation)
+
+        # Build JDBC config using DVT's unified infrastructure
+        jdbc_url, jdbc_properties = build_jdbc_config(credentials)
+
+        # DVT v0.59.0a48: Build table name WITHOUT adapter quoting for Spark JDBC
+        # Spark JDBC expects unquoted table names (it handles quoting internally)
+        # relation.render() returns quoted names like "public"."table" which breaks JDBC
+        schema = self.node.schema or 'public'
+        table_name = self.node.alias or self.node.name
+        target_table = f"{schema}.{table_name}"
+
+        # Count rows before write
+        row_count = spark_df.count()
+
+        # Write to target via Spark JDBC
+        spark_df.write \
+            .mode("overwrite") \
+            .jdbc(jdbc_url, target_table, properties=jdbc_properties)
+
+        return row_count
+
+    # Adapters that require strict column name sanitization
+    # These don't support special characters (spaces, semicolons, etc.) in column names
+    # even with quoting. Most traditional databases (PostgreSQL, MySQL, etc.) support
+    # quoted identifiers with special characters.
+    STRICT_COLUMN_NAME_ADAPTERS = {
+        'databricks',  # Delta tables reject special chars
+        'spark',       # Hive metastore has restrictions
+        'delta',       # Delta Lake format
+        'iceberg',     # Apache Iceberg format
+        'hudi',        # Apache Hudi format
+    }
+
+    def _sanitize_column_names(self, spark_df, adapter_type: str):
+        """Sanitize column names based on adapter requirements from MDM.
+
+        DVT v0.59.0a48: MDM-driven column name sanitization.
+        Reads syntax_registry from MDM to determine if adapter requires strict identifiers.
+
+        For strict adapters (determined by MDM syntax_registry):
+        1. Strips leading/trailing whitespace
+        2. Replaces spaces with underscores
+        3. Removes problematic characters: ,;{}()\n\t=
+        4. Ensures SQL-safe identifiers
+
+        For permissive adapters:
+        Preserves column names exactly as they appear in source.
+
+        Args:
+            spark_df: Spark DataFrame to sanitize
+            adapter_type: The adapter type (e.g., 'postgres', 'databricks')
+
+        Returns:
+            Spark DataFrame with sanitized column names (if needed)
+        """
+        # Check MDM for adapter's strict identifier requirement
+        requires_strict = self._check_strict_identifiers_from_mdm(adapter_type)
+
+        if requires_strict:
+            # Full sanitization for strict adapters
+            new_columns = []
+            for col in spark_df.columns:
+                new_col = col.strip()
+                new_col = new_col.replace(" ", "_")
+                new_col = re.sub(r'[,;{}()\n\t=]', '', new_col)
+                new_col = re.sub(r'[^\w]', '_', new_col)
+                new_col = re.sub(r'_+', '_', new_col)
+                new_col = new_col.strip('_')
+                if not new_col:
+                    new_col = f"col_{spark_df.columns.index(col)}"
+                new_columns.append(new_col)
+            return spark_df.toDF(*new_columns)
+        else:
+            # Permissive adapters: preserve column names exactly as-is
+            return spark_df
+
+    def _check_strict_identifiers_from_mdm(self, adapter_type: str) -> bool:
+        """Check MDM syntax_registry for adapter's strict identifier requirement.
+
+        DVT v0.59.0a48: Uses MDM to determine sanitization behavior.
+        Adapters with backtick quoting (`) typically require strict identifiers
+        because they're often data lake formats (Databricks, Spark, BigQuery).
+
+        Args:
+            adapter_type: The adapter type (e.g., 'postgres', 'databricks')
+
+        Returns:
+            True if adapter requires strict column name sanitization
+        """
+        from pathlib import Path
+
+        # MDM locations
+        registry_paths = [
+            Path(__file__).parent.parent / "include" / "data" / "adapters_registry.duckdb",
+            Path.home() / ".dvt" / ".data" / "mdm.duckdb",
+        ]
+
+        for path in registry_paths:
+            if path.exists():
+                try:
+                    conn = duckdb.connect(str(path), read_only=True)
+                    # Check if requires_strict_identifiers column exists
+                    cols = conn.execute(
+                        "SELECT column_name FROM information_schema.columns "
+                        "WHERE table_name='syntax_registry' AND column_name='requires_strict_identifiers'"
+                    ).fetchall()
+
+                    if cols:
+                        # Use explicit column if it exists
+                        result = conn.execute(
+                            "SELECT requires_strict_identifiers FROM syntax_registry "
+                            f"WHERE adapter_name = '{adapter_type}'"
+                        ).fetchone()
+                        conn.close()
+                        if result:
+                            return bool(result[0])
+                    else:
+                        # Fallback: infer from quoting style
+                        # Backtick (`) adapters typically require strict identifiers
+                        result = conn.execute(
+                            "SELECT quote_start FROM syntax_registry "
+                            f"WHERE adapter_name = '{adapter_type}'"
+                        ).fetchone()
+                        conn.close()
+                        if result and result[0] == '`':
+                            return True
+                    break
+                except Exception:
+                    pass
+
+        # Default fallback for known strict adapters (if MDM lookup fails)
+        return adapter_type in self.STRICT_COLUMN_NAME_ADAPTERS
+
+    def execute(self, model, manifest):
+        """Execute seed loading with Spark and pattern transformations."""
+        start_time = time.time()
+
+        try:
+            # Get seed file path
+            seed_path = self._get_seed_path()
+
+            # Read CSV with Spark
+            spark = self._get_spark_session()
+            spark_df = spark.read \
+                .option("header", "true") \
+                .option("inferSchema", "false") \
+                .csv(str(seed_path))
+
+            # Get adapter for writing (needed for adapter-specific sanitization)
+            adapter = get_adapter(self.config)
+            adapter_type = adapter.type()
+
+            # Sanitize column names based on adapter requirements
+            # Strict adapters (Databricks) need full sanitization
+            # Permissive adapters (PostgreSQL) only need whitespace trimming
+            spark_df = self._sanitize_column_names(spark_df, adapter_type)
+
+            # Detect and apply transformations
+            transformations = self._detect_transformations(spark_df)
+            spark_df = self._apply_transformations(spark_df, transformations)
+
+            # Write to target using unified JDBC
+            row_count = self._write_to_target(spark_df, adapter)
+
+            execution_time = time.time() - start_time
+
+            # Build result
+            return RunResult(
+                status=RunStatus.Success,
+                timing=[],
+                thread_id="",
+                execution_time=execution_time,
+                adapter_response={},
+                message=f"INSERT {row_count}",
+                failures=None,
+                node=model,
+            )
+
+        except Exception as e:
+            execution_time = time.time() - start_time
+            return RunResult(
+                status=RunStatus.Error,
+                timing=[],
+                thread_id="",
+                execution_time=execution_time,
+                adapter_response={},
+                message=str(e),
+                failures=1,
+                node=model,
+            )
+
+    def compile(self, manifest):
+        return self.node
+
+    def print_result_line(self, result):
+        group = group_lookup.get(self.node.unique_id)
+        level = EventLevel.ERROR if result.status == NodeStatus.Error else EventLevel.INFO
+        fire_event(
+            LogSeedResult(
+                status=result.status,
+                result_message=result.message,
+                index=self.node_index,
+                total=self.num_nodes,
+                execution_time=result.execution_time,
+                schema=self.node.schema,
+                relation=self.node.alias,
+                node_info=self.node.node_info,
+                group=group,
+            ),
+            level=level,
+        )
+
+
+class DVTSeedTask(RunTask):
+    """DVT Seed Task - Enhanced seed loading with Spark and multi-bar Rich UI.
+
+    DVT v0.59.0a37: Header displays BEFORE execution via before_run() hook.
+    - File-based databases: NATIVE (DuckDB, SQLite)
+    - Network databases: SPARK-JDBC (Postgres, Snowflake, etc.)
+    """
+
+    # File-based databases use native connections (not JDBC)
+    FILE_BASED_ADAPTERS = {'duckdb', 'sqlite'}
+
+    def __init__(self, args, config, manifest):
+        super().__init__(args, config, manifest)
+        self._display: Optional[DVTMultiBarDisplay] = None
+        self._adapter_type = None
+        self._use_rich_output = HAS_RICH and not getattr(args, 'QUIET', False)
+        self._spark_logger = None
+        self._header_displayed = False
+
+    def raise_on_first_error(self) -> bool:
+        return False
+
+    def get_node_selector(self):
+        if self.manifest is None or self.graph is None:
+            raise DbtInternalError("manifest and graph must be set to perform node selection")
+        return ResourceTypeSelector(
+            graph=self.graph,
+            manifest=self.manifest,
+            previous_state=self.previous_state,
+            resource_types=[NodeType.Seed],
+        )
+
+    def get_runner_type(self, _):
+        return DVTSeedRunner
+
+    def _get_execution_path(self) -> str:
+        """Determine execution path based on adapter type."""
+        if self._adapter_type is None:
+            try:
+                if self.config is None:
+                    raise ValueError("config is None")
+                credentials = self.config.credentials
+                if credentials is None:
+                    raise ValueError("credentials is None")
+                self._adapter_type = getattr(credentials, 'type', None)
+                if not self._adapter_type:
+                    adapter = get_adapter(self.config)
+                    self._adapter_type = adapter.type()
+            except Exception:
+                self._adapter_type = 'unknown'
+
+        if self._adapter_type in self.FILE_BASED_ADAPTERS:
+            return "NATIVE"
+        return "SPARK-JDBC"
+
+    def _get_target_info(self) -> str:
+        """Get the current target name for display."""
+        cli_target = getattr(self.config.args, 'TARGET', None)
+        return cli_target or self.config.target_name or "default"
+
+    def _get_compute_info(self) -> str:
+        """Get the current compute engine for display."""
+        exec_path = self._get_execution_path()
+        if exec_path == "NATIVE":
+            return "native"
+        cli_compute = getattr(self.config.args, 'TARGET_COMPUTE', None)
+        return cli_compute or "spark-local"
+
+    def _start_spark_logger(self) -> None:
+        """Start Spark output logging to target directory.
+
+        Note: suppress_console=False so dbt's event output flows normally.
+        Spark output is tee'd to the log file for later reference.
+        """
+        import os
+        try:
+            from dbt.compute.spark_logger import get_spark_logger
+            target_dir = os.path.join(os.getcwd(), "target")
+            compute_name = self._get_compute_info().replace("-", "_")
+            self._spark_logger = get_spark_logger(target_dir, compute_name)
+            # Don't suppress console - let dbt events flow normally
+            self._spark_logger.start_session(suppress_console=False)
+        except Exception:
+            self._spark_logger = None
+
+    def _stop_spark_logger(self) -> None:
+        """Stop Spark output logging."""
+        if self._spark_logger:
+            try:
+                self._spark_logger.end_session()
+            except Exception:
+                pass
+            self._spark_logger = None
+
+    def before_run(self, adapter, selected_uids):
+        """
+        Called BEFORE model execution starts - this is where we show the header.
+
+        DVT v0.59.0a38: Fixed header timing to display BEFORE execution.
+        The before_run() hook is called after 'Concurrency: X threads' message
+        but before any models start executing.
+        """
+        result = super().before_run(adapter, selected_uids)
+
+        # Show header BEFORE execution (only once)
+        if self._use_rich_output and not self._header_displayed:
+            try:
+                exec_path = self._get_execution_path()
+                self._display = DVTMultiBarDisplay(
+                    title="DVT Seed",
+                    operation="seed",
+                    target=self._get_target_info(),
+                    compute=self._get_compute_info(),
+                )
+                self._display.start_display()
+                self._header_displayed = True
+
+                # Start Spark logging AFTER header is shown
+                if exec_path != "NATIVE":
+                    self._start_spark_logger()
+            except Exception:
+                self._display = None
+
+        return result
+
+    def run(self):
+        """Override run to add Rich UI summary AFTER execution.
+
+        DVT v0.59.0a38: Header is now displayed in before_run() hook.
+        This method only handles summary display after execution completes.
+        """
+        # Run the parent implementation
+        # Header is shown in before_run(), Spark logger started there too
+        results = super().run()
+
+        # Stop Spark logging FIRST so we can print to console
+        self._stop_spark_logger()
+
+        # Show summary AFTER execution
+        exec_path = self._get_execution_path()
+        if results and self._display:
+            try:
+                # Update stats from results
+                for result in results:
+                    if result.node:
+                        duration_ms = (result.execution_time or 0) * 1000
+
+                        if result.status == RunStatus.Success:
+                            status = "pass"
+                            error_msg = None
+                        elif result.status == RunStatus.Error:
+                            status = "error"
+                            error_msg = result.message
+                        else:
+                            status = "skip"
+                            error_msg = None
+
+                        self._display.update_model_complete(
+                            unique_id=result.node.unique_id,
+                            status=status,
+                            duration_ms=duration_ms,
+                            execution_path=exec_path,
+                            error_message=error_msg,
+                        )
+
+                self._display.stop_display()
+                self._display.print_summary()
+
+            except Exception:
+                pass
+
+        return results
+
+    def task_end_messages(self, results) -> None:
+        # Rich UI handles the summary, so we skip the default messages
+        if self._display:
+            return
+        super().task_end_messages(results)
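
Illustrative sketch (not part of the package): the added dvt_seed.py selects a column transformation only when a registry pattern matches at least 80% of the sampled non-empty values, taking patterns in priority order. The snippet below reproduces that selection rule in isolation; the pattern rows and sample values are hypothetical stand-ins for what ValueTransformationRegistry.get_patterns() reads from adapters_registry.duckdb or ~/.dvt/.data/mdm.duckdb.

import re
from typing import List, Optional, Tuple

# Hypothetical registry rows: (pattern, target_type, transform_expr, priority),
# mirroring the value_transformations columns queried by get_patterns().
PATTERNS: List[Tuple[str, str, str, int]] = [
    (r"^\d{4}-\d{2}-\d{2}$", "date", "to_date({col}, 'yyyy-MM-dd')", 100),
    (r"^(true|false)$", "boolean", "cast({col} as boolean)", 50),
]


def match_pattern(sample_values: List[str]) -> Optional[Tuple[str, str]]:
    """Return (target_type, transform_expr) when >= 80% of non-empty samples match a pattern."""
    valid = [v for v in sample_values if v is not None and str(v).strip()]
    if not valid:
        return None
    # Highest-priority pattern wins, as in the ORDER BY priority DESC query.
    for pattern, target_type, transform_expr, _ in sorted(PATTERNS, key=lambda p: -p[3]):
        regex = re.compile(pattern, re.IGNORECASE)
        matches = sum(1 for v in valid if regex.match(str(v).strip()))
        if matches / len(valid) >= 0.8:
            return (target_type, transform_expr)
    return None


# Four of the five sampled values are ISO dates (80%), so the date transform is selected.
print(match_pattern(["2024-01-01", "2024-02-15", "n/a", "2024-03-09", "2024-04-30"]))
# ('date', "to_date({col}, 'yyyy-MM-dd')")

In dvt_seed.py itself, the returned transform_expr then has {col} replaced with the backtick-quoted column name and is applied via F.expr() in _apply_transformations().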