dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl

Files changed (324)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2403 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
  74. dbt/compute/engines/spark_engine.py +642 -0
  75. dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
  76. dbt/compute/federated_executor.py +1080 -0
  77. dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
  78. dbt/compute/filter_pushdown.py +273 -0
  79. dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
  80. dbt/compute/jar_provisioning.py +255 -0
  81. dbt/compute/java_compat.cpython-311-darwin.so +0 -0
  82. dbt/compute/java_compat.py +689 -0
  83. dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
  84. dbt/compute/jdbc_utils.py +678 -0
  85. dbt/compute/metadata/__init__.py +40 -0
  86. dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
  87. dbt/compute/metadata/adapters_registry.py +370 -0
  88. dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
  89. dbt/compute/metadata/registry.py +674 -0
  90. dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
  91. dbt/compute/metadata/store.py +1499 -0
  92. dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
  93. dbt/compute/smart_selector.py +377 -0
  94. dbt/compute/strategies/__init__.py +55 -0
  95. dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
  96. dbt/compute/strategies/base.py +165 -0
  97. dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
  98. dbt/compute/strategies/dataproc.py +207 -0
  99. dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
  100. dbt/compute/strategies/emr.py +203 -0
  101. dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
  102. dbt/compute/strategies/local.py +443 -0
  103. dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
  104. dbt/compute/strategies/standalone.py +262 -0
  105. dbt/config/__init__.py +4 -0
  106. dbt/config/catalogs.py +94 -0
  107. dbt/config/compute.cpython-311-darwin.so +0 -0
  108. dbt/config/compute.py +513 -0
  109. dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
  110. dbt/config/dvt_profile.py +342 -0
  111. dbt/config/profile.py +422 -0
  112. dbt/config/project.py +873 -0
  113. dbt/config/project_utils.py +28 -0
  114. dbt/config/renderer.py +231 -0
  115. dbt/config/runtime.py +553 -0
  116. dbt/config/selectors.py +208 -0
  117. dbt/config/utils.py +77 -0
  118. dbt/constants.py +28 -0
  119. dbt/context/__init__.py +0 -0
  120. dbt/context/base.py +745 -0
  121. dbt/context/configured.py +135 -0
  122. dbt/context/context_config.py +382 -0
  123. dbt/context/docs.py +82 -0
  124. dbt/context/exceptions_jinja.py +178 -0
  125. dbt/context/macro_resolver.py +195 -0
  126. dbt/context/macros.py +171 -0
  127. dbt/context/manifest.py +72 -0
  128. dbt/context/providers.py +2249 -0
  129. dbt/context/query_header.py +13 -0
  130. dbt/context/secret.py +58 -0
  131. dbt/context/target.py +74 -0
  132. dbt/contracts/__init__.py +0 -0
  133. dbt/contracts/files.py +413 -0
  134. dbt/contracts/graph/__init__.py +0 -0
  135. dbt/contracts/graph/manifest.py +1904 -0
  136. dbt/contracts/graph/metrics.py +97 -0
  137. dbt/contracts/graph/model_config.py +70 -0
  138. dbt/contracts/graph/node_args.py +42 -0
  139. dbt/contracts/graph/nodes.py +1806 -0
  140. dbt/contracts/graph/semantic_manifest.py +232 -0
  141. dbt/contracts/graph/unparsed.py +811 -0
  142. dbt/contracts/project.py +417 -0
  143. dbt/contracts/results.py +53 -0
  144. dbt/contracts/selection.py +23 -0
  145. dbt/contracts/sql.py +85 -0
  146. dbt/contracts/state.py +68 -0
  147. dbt/contracts/util.py +46 -0
  148. dbt/deprecations.py +348 -0
  149. dbt/deps/__init__.py +0 -0
  150. dbt/deps/base.py +152 -0
  151. dbt/deps/git.py +195 -0
  152. dbt/deps/local.py +79 -0
  153. dbt/deps/registry.py +130 -0
  154. dbt/deps/resolver.py +149 -0
  155. dbt/deps/tarball.py +120 -0
  156. dbt/docs/source/_ext/dbt_click.py +119 -0
  157. dbt/docs/source/conf.py +32 -0
  158. dbt/env_vars.py +64 -0
  159. dbt/event_time/event_time.py +40 -0
  160. dbt/event_time/sample_window.py +60 -0
  161. dbt/events/__init__.py +15 -0
  162. dbt/events/base_types.py +36 -0
  163. dbt/events/core_types_pb2.py +2 -0
  164. dbt/events/logging.py +108 -0
  165. dbt/events/types.py +2516 -0
  166. dbt/exceptions.py +1486 -0
  167. dbt/flags.py +89 -0
  168. dbt/graph/__init__.py +11 -0
  169. dbt/graph/cli.py +249 -0
  170. dbt/graph/graph.py +172 -0
  171. dbt/graph/queue.py +214 -0
  172. dbt/graph/selector.py +374 -0
  173. dbt/graph/selector_methods.py +975 -0
  174. dbt/graph/selector_spec.py +222 -0
  175. dbt/graph/thread_pool.py +18 -0
  176. dbt/hooks.py +21 -0
  177. dbt/include/README.md +49 -0
  178. dbt/include/__init__.py +3 -0
  179. dbt/include/data/adapters_registry.duckdb +0 -0
  180. dbt/include/data/build_registry.py +242 -0
  181. dbt/include/data/csv/adapter_queries.csv +33 -0
  182. dbt/include/data/csv/syntax_rules.csv +9 -0
  183. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  184. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  185. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  186. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  187. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  188. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  189. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  190. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  191. dbt/include/starter_project/.gitignore +4 -0
  192. dbt/include/starter_project/README.md +15 -0
  193. dbt/include/starter_project/__init__.py +3 -0
  194. dbt/include/starter_project/analyses/.gitkeep +0 -0
  195. dbt/include/starter_project/dbt_project.yml +36 -0
  196. dbt/include/starter_project/macros/.gitkeep +0 -0
  197. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  198. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  199. dbt/include/starter_project/models/example/schema.yml +21 -0
  200. dbt/include/starter_project/seeds/.gitkeep +0 -0
  201. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  202. dbt/include/starter_project/tests/.gitkeep +0 -0
  203. dbt/internal_deprecations.py +26 -0
  204. dbt/jsonschemas/__init__.py +3 -0
  205. dbt/jsonschemas/jsonschemas.py +309 -0
  206. dbt/jsonschemas/project/0.0.110.json +4717 -0
  207. dbt/jsonschemas/project/0.0.85.json +2015 -0
  208. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  209. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  210. dbt/jsonschemas/resources/latest.json +6773 -0
  211. dbt/links.py +4 -0
  212. dbt/materializations/__init__.py +0 -0
  213. dbt/materializations/incremental/__init__.py +0 -0
  214. dbt/materializations/incremental/microbatch.py +236 -0
  215. dbt/mp_context.py +8 -0
  216. dbt/node_types.py +37 -0
  217. dbt/parser/__init__.py +23 -0
  218. dbt/parser/analysis.py +21 -0
  219. dbt/parser/base.py +548 -0
  220. dbt/parser/common.py +266 -0
  221. dbt/parser/docs.py +52 -0
  222. dbt/parser/fixtures.py +51 -0
  223. dbt/parser/functions.py +30 -0
  224. dbt/parser/generic_test.py +100 -0
  225. dbt/parser/generic_test_builders.py +333 -0
  226. dbt/parser/hooks.py +118 -0
  227. dbt/parser/macros.py +137 -0
  228. dbt/parser/manifest.py +2204 -0
  229. dbt/parser/models.py +573 -0
  230. dbt/parser/partial.py +1178 -0
  231. dbt/parser/read_files.py +445 -0
  232. dbt/parser/schema_generic_tests.py +422 -0
  233. dbt/parser/schema_renderer.py +111 -0
  234. dbt/parser/schema_yaml_readers.py +935 -0
  235. dbt/parser/schemas.py +1466 -0
  236. dbt/parser/search.py +149 -0
  237. dbt/parser/seeds.py +28 -0
  238. dbt/parser/singular_test.py +20 -0
  239. dbt/parser/snapshots.py +44 -0
  240. dbt/parser/sources.py +558 -0
  241. dbt/parser/sql.py +62 -0
  242. dbt/parser/unit_tests.py +621 -0
  243. dbt/plugins/__init__.py +20 -0
  244. dbt/plugins/contracts.py +9 -0
  245. dbt/plugins/exceptions.py +2 -0
  246. dbt/plugins/manager.py +163 -0
  247. dbt/plugins/manifest.py +21 -0
  248. dbt/profiler.py +20 -0
  249. dbt/py.typed +1 -0
  250. dbt/query_analyzer.cpython-311-darwin.so +0 -0
  251. dbt/query_analyzer.py +410 -0
  252. dbt/runners/__init__.py +2 -0
  253. dbt/runners/exposure_runner.py +7 -0
  254. dbt/runners/no_op_runner.py +45 -0
  255. dbt/runners/saved_query_runner.py +7 -0
  256. dbt/selected_resources.py +8 -0
  257. dbt/task/__init__.py +0 -0
  258. dbt/task/base.py +503 -0
  259. dbt/task/build.py +197 -0
  260. dbt/task/clean.py +56 -0
  261. dbt/task/clone.py +161 -0
  262. dbt/task/compile.py +150 -0
  263. dbt/task/compute.cpython-311-darwin.so +0 -0
  264. dbt/task/compute.py +458 -0
  265. dbt/task/debug.py +505 -0
  266. dbt/task/deps.py +280 -0
  267. dbt/task/docs/__init__.py +3 -0
  268. dbt/task/docs/api/__init__.py +23 -0
  269. dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
  270. dbt/task/docs/api/catalog.py +204 -0
  271. dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
  272. dbt/task/docs/api/lineage.py +234 -0
  273. dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
  274. dbt/task/docs/api/profile.py +204 -0
  275. dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
  276. dbt/task/docs/api/spark.py +186 -0
  277. dbt/task/docs/generate.py +947 -0
  278. dbt/task/docs/index.html +250 -0
  279. dbt/task/docs/serve.cpython-311-darwin.so +0 -0
  280. dbt/task/docs/serve.py +174 -0
  281. dbt/task/dvt_output.py +362 -0
  282. dbt/task/dvt_run.py +204 -0
  283. dbt/task/freshness.py +322 -0
  284. dbt/task/function.py +121 -0
  285. dbt/task/group_lookup.py +46 -0
  286. dbt/task/init.cpython-311-darwin.so +0 -0
  287. dbt/task/init.py +604 -0
  288. dbt/task/java.cpython-311-darwin.so +0 -0
  289. dbt/task/java.py +316 -0
  290. dbt/task/list.py +236 -0
  291. dbt/task/metadata.cpython-311-darwin.so +0 -0
  292. dbt/task/metadata.py +804 -0
  293. dbt/task/printer.py +175 -0
  294. dbt/task/profile.cpython-311-darwin.so +0 -0
  295. dbt/task/profile.py +1307 -0
  296. dbt/task/profile_serve.py +615 -0
  297. dbt/task/retract.py +438 -0
  298. dbt/task/retry.py +175 -0
  299. dbt/task/run.py +1387 -0
  300. dbt/task/run_operation.py +141 -0
  301. dbt/task/runnable.py +758 -0
  302. dbt/task/seed.py +103 -0
  303. dbt/task/show.py +149 -0
  304. dbt/task/snapshot.py +56 -0
  305. dbt/task/spark.cpython-311-darwin.so +0 -0
  306. dbt/task/spark.py +414 -0
  307. dbt/task/sql.py +110 -0
  308. dbt/task/target_sync.cpython-311-darwin.so +0 -0
  309. dbt/task/target_sync.py +766 -0
  310. dbt/task/test.py +464 -0
  311. dbt/tests/fixtures/__init__.py +1 -0
  312. dbt/tests/fixtures/project.py +620 -0
  313. dbt/tests/util.py +651 -0
  314. dbt/tracking.py +529 -0
  315. dbt/utils/__init__.py +3 -0
  316. dbt/utils/artifact_upload.py +151 -0
  317. dbt/utils/utils.py +408 -0
  318. dbt/version.py +270 -0
  319. dvt_cli/__init__.py +72 -0
  320. dvt_core-0.58.6.dist-info/METADATA +288 -0
  321. dvt_core-0.58.6.dist-info/RECORD +324 -0
  322. dvt_core-0.58.6.dist-info/WHEEL +5 -0
  323. dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
  324. dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/compute/metadata/store.py (new file; the diff below is truncated partway through)
@@ -0,0 +1,1499 @@
+ # =============================================================================
+ # DVT Project Metadata Store
+ # =============================================================================
+ # DuckDB-based metadata store for DVT projects.
+ #
+ # This store contains PROJECT-LEVEL data only:
+ # - Column metadata (from dvt snap or federated runs)
+ # - Row counts (from dvt snap only, NOT during every run)
+ #
+ # Static registry data (type mappings, syntax rules, adapter queries) comes
+ # from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+ #
+ # Location: <project>/.dvt/metadata_store.duckdb
+ #
+ # DVT v0.54.0: Initial implementation
+ # DVT v0.55.0: Refactored to separate project metadata from shipped registry
+ # =============================================================================
+
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+ from datetime import datetime
+
+ try:
+     import duckdb
+     HAS_DUCKDB = True
+ except ImportError:
+     HAS_DUCKDB = False
+
+ from dbt.compute.metadata.adapters_registry import (
+     AdaptersRegistry,
+     TypeMapping,
+     SyntaxRule,
+     get_registry,
+     get_spark_type as registry_get_spark_type,
+     get_syntax_rule as registry_get_syntax_rule,
+     get_metadata_query as registry_get_metadata_query,
+ )
+
+
+ @dataclass
+ class ColumnMetadata:
+     """Metadata for a single column."""
+     column_name: str
+     adapter_type: str
+     spark_type: str
+     is_nullable: bool
+     is_primary_key: bool
+     ordinal_position: int
+
+
+ @dataclass
+ class TableMetadata:
+     """Metadata for a table/view (columns only, no row count)."""
+     source_name: str
+     table_name: str
+     adapter_name: str
+     connection_name: str
+     schema_name: str
+     columns: List[ColumnMetadata]
+     last_refreshed: datetime
+
+
+ @dataclass
+ class RowCountInfo:
+     """Row count information for a table."""
+     source_name: str
+     table_name: str
+     row_count: int
+     last_refreshed: datetime
+
+
+ # =============================================================================
+ # Profile Results (v0.56.0 - dvt profile command)
+ # =============================================================================
+
+ @dataclass
+ class ColumnProfileResult:
+     """Profile result for a single column."""
+     source_name: str
+     table_name: str
+     column_name: str
+     profile_mode: str  # 'minimal', 'explorative', 'sensitive', 'time-series'
+
+     # Basic metrics (all modes)
+     row_count: Optional[int] = None
+     null_count: Optional[int] = None
+     null_percent: Optional[float] = None
+     distinct_count: Optional[int] = None
+     distinct_percent: Optional[float] = None
+
+     # Numeric metrics (explorative+)
+     min_value: Optional[float] = None
+     max_value: Optional[float] = None
+     mean_value: Optional[float] = None
+     median_value: Optional[float] = None
+     stddev_value: Optional[float] = None
+     p25: Optional[float] = None
+     p50: Optional[float] = None
+     p75: Optional[float] = None
+
+     # String metrics (explorative+)
+     min_length: Optional[int] = None
+     max_length: Optional[int] = None
+     avg_length: Optional[float] = None
+
+     # Distribution data (JSON strings)
+     histogram: Optional[str] = None  # JSON: bucket counts
+     top_values: Optional[str] = None  # JSON: top N values with counts
+
+     # Quality alerts (JSON string)
+     alerts: Optional[str] = None  # JSON: [{type, severity, message}]
+
+     # Metadata
+     profiled_at: Optional[datetime] = None
+     duration_ms: Optional[int] = None
+
+
+ # =============================================================================
+ # Catalog Nodes (v0.56.0 - dvt docs generate enhancement)
+ # =============================================================================
+
+ @dataclass
+ class CatalogNode:
+     """Enriched catalog node for dvt docs generate."""
+     unique_id: str
+     resource_type: str  # 'model', 'source', 'test', 'seed', 'snapshot'
+     name: str
+     schema_name: Optional[str] = None
+     database: Optional[str] = None
+
+     # Connection info
+     connection_name: Optional[str] = None
+     adapter_type: Optional[str] = None
+
+     # Documentation
+     description: Optional[str] = None
+
+     # Visual enrichment
+     icon_type: Optional[str] = None  # 'postgres', 'snowflake', 'spark', etc.
+     color_hex: Optional[str] = None  # Connection color
+
+     # Config
+     materialized: Optional[str] = None
+     tags: Optional[str] = None  # JSON array
+     meta: Optional[str] = None  # JSON object
+
+     # Columns (JSON array)
+     columns: Optional[str] = None
+
+     # Statistics
+     row_count: Optional[int] = None
+     bytes_stored: Optional[int] = None
+
+     # Timestamps
+     created_at: Optional[datetime] = None
+     updated_at: Optional[datetime] = None
+
+
+ # =============================================================================
+ # Lineage Edges (v0.56.0 - dvt docs generate enhancement)
+ # =============================================================================
+
+ @dataclass
+ class LineageEdge:
+     """Lineage edge representing a dependency between nodes."""
+     id: Optional[int] = None
+     source_node_id: str = ""
+     target_node_id: str = ""
+     edge_type: str = ""  # 'ref', 'source', 'depends_on'
+
+     # Cross-connection indicator
+     is_cross_connection: bool = False
+     source_connection: Optional[str] = None
+     target_connection: Optional[str] = None
+
+
+ class ProjectMetadataStore:
+     """
+     DuckDB-based metadata store for a DVT project.
+
+     Location: <project_root>/.dvt/metadata_store.duckdb
+
+     Tables (project-level data only):
+     - column_metadata: source_name, table_name, column_name, adapter_type, spark_type, ...
+     - row_counts: source_name, table_name, row_count, last_refreshed
+
+     NOTE: Static registry data (type mappings, syntax rules, adapter queries)
+     comes from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+     """
+
+     DVT_DIR = ".dvt"
+     METADATA_DB = "metadata_store.duckdb"
+
+     def __init__(self, project_root: Path):
+         """
+         Initialize the metadata store.
+
+         Args:
+             project_root: Path to the DVT project root directory
+         """
+         if not HAS_DUCKDB:
+             raise ImportError(
+                 "DuckDB is required for metadata store. "
+                 "Install with: pip install duckdb"
+             )
+
+         self.project_root = Path(project_root)
+         self.dvt_dir = self.project_root / self.DVT_DIR
+         self.db_path = self.dvt_dir / self.METADATA_DB
+         self._conn: Optional[duckdb.DuckDBPyConnection] = None
+         self._registry: Optional[AdaptersRegistry] = None
+
+     @property
+     def conn(self) -> "duckdb.DuckDBPyConnection":
+         """Get or create database connection."""
+         if self._conn is None:
+             self._conn = duckdb.connect(str(self.db_path))
+         return self._conn
+
+     @property
+     def registry(self) -> AdaptersRegistry:
+         """Get the shipped adapters registry (singleton)."""
+         if self._registry is None:
+             self._registry = get_registry()
+         return self._registry
+
+     def close(self) -> None:
+         """Close the database connection."""
+         if self._conn is not None:
+             self._conn.close()
+             self._conn = None
+
+     def __enter__(self) -> "ProjectMetadataStore":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         self.close()
+
+     # =========================================================================
+     # Initialization
+     # =========================================================================
+
+     def initialize(self) -> None:
+         """
+         Initialize the metadata store.
+
+         Creates:
+         1. .dvt/ directory if it doesn't exist
+         2. metadata_store.duckdb database
+         3. Schema tables (column_metadata, row_counts)
+
+         NOTE: No registry data is loaded - that comes from shipped DuckDB.
+         """
+         # Create .dvt/ directory
+         self.dvt_dir.mkdir(parents=True, exist_ok=True)
+
+         # Create schema tables
+         self._create_schema()
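
For orientation, a minimal lifecycle sketch of the API defined above; the project path is a placeholder, and the import path is assumed from the file's location (dbt/compute/metadata/store.py):

    from pathlib import Path
    from dbt.compute.metadata.store import ProjectMetadataStore

    # Open the store for a hypothetical project root, create .dvt/ and the
    # schema tables, and let the context manager close the connection.
    with ProjectMetadataStore(Path("/tmp/demo_project")) as store:
        store.initialize()
        print(store.db_path)  # /tmp/demo_project/.dvt/metadata_store.duckdb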
+
+     def _create_schema(self) -> None:
+         """Create the database schema tables."""
+
+         # Column metadata table (populated by dvt snap or federated runs)
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS column_metadata (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 column_name VARCHAR NOT NULL,
+                 adapter_name VARCHAR NOT NULL,
+                 connection_name VARCHAR NOT NULL,
+                 schema_name VARCHAR,
+                 adapter_type VARCHAR NOT NULL,
+                 spark_type VARCHAR NOT NULL,
+                 is_nullable BOOLEAN DEFAULT TRUE,
+                 is_primary_key BOOLEAN DEFAULT FALSE,
+                 ordinal_position INTEGER,
+                 last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 PRIMARY KEY(source_name, table_name, column_name)
+             )
+         """)
+
+         # Row counts table (ONLY populated by dvt snap, not during runs)
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS row_counts (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 row_count BIGINT,
+                 last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 PRIMARY KEY(source_name, table_name)
+             )
+         """)
+
+         # =====================================================================
+         # v0.56.0: Profile Results (dvt profile command)
+         # =====================================================================
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS profile_results (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 column_name VARCHAR NOT NULL,
+                 profile_mode VARCHAR NOT NULL,
+
+                 -- Basic metrics (all modes)
+                 row_count BIGINT,
+                 null_count BIGINT,
+                 null_percent DOUBLE,
+                 distinct_count BIGINT,
+                 distinct_percent DOUBLE,
+
+                 -- Numeric metrics (explorative+)
+                 min_value DOUBLE,
+                 max_value DOUBLE,
+                 mean_value DOUBLE,
+                 median_value DOUBLE,
+                 stddev_value DOUBLE,
+                 p25 DOUBLE,
+                 p50 DOUBLE,
+                 p75 DOUBLE,
+
+                 -- String metrics (explorative+)
+                 min_length INTEGER,
+                 max_length INTEGER,
+                 avg_length DOUBLE,
+
+                 -- Distribution data (JSON)
+                 histogram JSON,
+                 top_values JSON,
+
+                 -- Quality alerts
+                 alerts JSON,
+
+                 -- Metadata
+                 profiled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 duration_ms INTEGER,
+
+                 PRIMARY KEY(source_name, table_name, column_name, profile_mode)
+             )
+         """)
+
+         # =====================================================================
+         # v0.56.0: Catalog Nodes (dvt docs generate enhancement)
+         # =====================================================================
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS catalog_nodes (
+                 unique_id VARCHAR PRIMARY KEY,
+                 resource_type VARCHAR NOT NULL,
+                 name VARCHAR NOT NULL,
+                 schema_name VARCHAR,
+                 database VARCHAR,
+
+                 -- Connection info
+                 connection_name VARCHAR,
+                 adapter_type VARCHAR,
+
+                 -- Documentation
+                 description TEXT,
+
+                 -- Visual enrichment
+                 icon_type VARCHAR,
+                 color_hex VARCHAR,
+
+                 -- Config
+                 materialized VARCHAR,
+                 tags JSON,
+                 meta JSON,
+
+                 -- Columns (JSON array)
+                 columns JSON,
+
+                 -- Statistics
+                 row_count BIGINT,
+                 bytes_stored BIGINT,
+
+                 -- Timestamps
+                 created_at TIMESTAMP,
+                 updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+             )
+         """)
+
+         # =====================================================================
+         # v0.56.0: Lineage Edges (dvt docs generate enhancement)
+         # =====================================================================
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS lineage_edges (
+                 id INTEGER PRIMARY KEY,
+                 source_node_id VARCHAR NOT NULL,
+                 target_node_id VARCHAR NOT NULL,
+                 edge_type VARCHAR NOT NULL,
+
+                 -- Cross-connection indicator
+                 is_cross_connection BOOLEAN DEFAULT FALSE,
+                 source_connection VARCHAR,
+                 target_connection VARCHAR
+             )
+         """)
+
+         # Create indexes for fast lookups
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_column_metadata_source
+             ON column_metadata(source_name, table_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_column_metadata_adapter
+             ON column_metadata(adapter_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_row_counts_source
+             ON row_counts(source_name)
+         """)
+
+         # v0.56.0: New indexes for profile, catalog, and lineage
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_profile_results_table
+             ON profile_results(source_name, table_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_catalog_nodes_type
+             ON catalog_nodes(resource_type)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_lineage_edges_source
+             ON lineage_edges(source_node_id)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_lineage_edges_target
+             ON lineage_edges(target_node_id)
+         """)
+
+     # =========================================================================
+     # Type Registry Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_spark_type(
+         self,
+         adapter_name: str,
+         adapter_type: str,
+         spark_version: str = "all"
+     ) -> Optional[str]:
+         """
+         Look up the Spark type for an adapter type.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter (e.g., 'postgres', 'snowflake')
+             adapter_type: Native adapter type (e.g., 'VARCHAR', 'INTEGER')
+             spark_version: Target Spark version (default: 'all')
+
+         Returns:
+             Spark type string or None if not found
+         """
+         mapping = self.registry.get_spark_type(adapter_name, adapter_type, spark_version)
+         return mapping.spark_type if mapping else None
+
+     def get_type_mappings(
+         self,
+         adapter_name: str,
+         spark_version: str = "all"
+     ) -> List[Tuple[str, str]]:
+         """
+         Get all type mappings for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Returns:
+             List of (adapter_type, spark_type) tuples
+         """
+         mappings = self.registry.get_all_mappings_for_adapter(adapter_name)
+         return [(m.adapter_type, m.spark_type) for m in mappings]
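
A quick illustration of the two delegating lookups above; the concrete mapping strings come from the shipped registry (the type_mappings_*.csv files in the file list), so the exact values returned are not shown here:

    store = ProjectMetadataStore(Path("/tmp/demo_project"))
    # Single lookup: native adapter type -> Spark type
    print(store.get_spark_type("postgres", "VARCHAR"))
    # Full mapping table for one adapter
    for adapter_type, spark_type in store.get_type_mappings("postgres"):
        print(f"{adapter_type} -> {spark_type}")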
+
+     # =========================================================================
+     # Syntax Registry Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_syntax_rule(self, adapter_name: str) -> Optional[SyntaxRule]:
+         """
+         Get syntax rules for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter
+
+         Returns:
+             SyntaxRule or None if not found
+         """
+         return self.registry.get_syntax_rule(adapter_name)
+
+     def quote_identifier(self, adapter_name: str, identifier: str) -> str:
+         """Quote an identifier for the given adapter."""
+         return self.registry.quote_identifier(adapter_name, identifier)
+
+     # =========================================================================
+     # Adapter Metadata Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_metadata_query(
+         self,
+         adapter_name: str,
+         query_type: str
+     ) -> Optional[str]:
+         """
+         Get the metadata query template for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter
+             query_type: Type of query ('columns', 'tables', 'row_count', 'primary_key')
+
+         Returns:
+             Query template string or None if not found
+         """
+         query = self.registry.get_metadata_query(adapter_name, query_type)
+         return query.query_template if query else None
+
+     # =========================================================================
+     # Column Metadata Operations
+     # =========================================================================
+
+     def save_table_metadata(self, metadata: TableMetadata) -> None:
+         """
+         Save table column metadata to the store.
+
+         This is called during federated execution to capture schema info.
+
+         Args:
+             metadata: TableMetadata object with column info
+         """
+         # Delete existing entries for this table
+         self.conn.execute("""
+             DELETE FROM column_metadata
+             WHERE source_name = ? AND table_name = ?
+         """, [metadata.source_name, metadata.table_name])
+
+         # Insert new entries
+         for col in metadata.columns:
+             self.conn.execute("""
+                 INSERT INTO column_metadata
+                     (source_name, table_name, column_name, adapter_name, connection_name,
+                      schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                      ordinal_position, last_refreshed)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, [
+                 metadata.source_name,
+                 metadata.table_name,
+                 col.column_name,
+                 metadata.adapter_name,
+                 metadata.connection_name,
+                 metadata.schema_name,
+                 col.adapter_type,
+                 col.spark_type,
+                 col.is_nullable,
+                 col.is_primary_key,
+                 col.ordinal_position,
+                 metadata.last_refreshed
+             ])
+
+     def get_table_metadata(
+         self,
+         source_name: str,
+         table_name: str
+     ) -> Optional[TableMetadata]:
+         """
+         Get cached column metadata for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+
+         Returns:
+             TableMetadata or None if not cached
+         """
+         results = self.conn.execute("""
+             SELECT
+                 source_name, table_name, column_name, adapter_name, connection_name,
+                 schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                 ordinal_position, last_refreshed
+             FROM column_metadata
+             WHERE source_name = ? AND table_name = ?
+             ORDER BY ordinal_position
+         """, [source_name, table_name]).fetchall()
+
+         if not results:
+             return None
+
+         # Build column list
+         columns = []
+         for r in results:
+             columns.append(ColumnMetadata(
+                 column_name=r[2],
+                 adapter_type=r[6],
+                 spark_type=r[7],
+                 is_nullable=r[8],
+                 is_primary_key=r[9],
+                 ordinal_position=r[10]
+             ))
+
+         # Build TableMetadata from first row
+         first = results[0]
+         return TableMetadata(
+             source_name=first[0],
+             table_name=first[1],
+             adapter_name=first[3],
+             connection_name=first[4],
+             schema_name=first[5],
+             columns=columns,
+             last_refreshed=first[11]
+         )
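
A save/reload roundtrip for the two methods above; the source, table, connection, and type names are invented for illustration:

    from datetime import datetime

    columns = [
        # column_name, adapter_type, spark_type, is_nullable, is_primary_key, ordinal_position
        ColumnMetadata("id", "INTEGER", "int", False, True, 1),
        ColumnMetadata("email", "VARCHAR", "string", True, False, 2),
    ]
    store.save_table_metadata(TableMetadata(
        source_name="crm", table_name="users", adapter_name="postgres",
        connection_name="crm_pg", schema_name="public",
        columns=columns, last_refreshed=datetime.now(),
    ))
    cached = store.get_table_metadata("crm", "users")
    assert cached is not None
    assert [c.column_name for c in cached.columns] == ["id", "email"]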
+
+     def get_all_sources(self) -> List[Tuple[str, str]]:
+         """
+         Get all source/table combinations in the store.
+
+         Returns:
+             List of (source_name, table_name) tuples
+         """
+         results = self.conn.execute("""
+             SELECT DISTINCT source_name, table_name
+             FROM column_metadata
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [(r[0], r[1]) for r in results]
+
+     def clear_column_metadata(self) -> None:
+         """Clear all column metadata."""
+         self.conn.execute("DELETE FROM column_metadata")
+
+     # =========================================================================
+     # Row Count Operations (dvt snap only)
+     # =========================================================================
+
+     def save_row_count(
+         self,
+         source_name: str,
+         table_name: str,
+         row_count: int,
+         last_refreshed: Optional[datetime] = None
+     ) -> None:
+         """
+         Save row count for a table.
+
+         This is ONLY called by dvt snap, not during regular runs.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             row_count: Number of rows
+             last_refreshed: Timestamp (defaults to now)
+         """
+         if last_refreshed is None:
+             last_refreshed = datetime.now()
+
+         self.conn.execute("""
+             INSERT OR REPLACE INTO row_counts
+                 (source_name, table_name, row_count, last_refreshed)
+             VALUES (?, ?, ?, ?)
+         """, [source_name, table_name, row_count, last_refreshed])
+
+     def get_row_count(self, source_name: str, table_name: str) -> Optional[RowCountInfo]:
+         """
+         Get cached row count for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+
+         Returns:
+             RowCountInfo or None if not cached
+         """
+         result = self.conn.execute("""
+             SELECT source_name, table_name, row_count, last_refreshed
+             FROM row_counts
+             WHERE source_name = ? AND table_name = ?
+         """, [source_name, table_name]).fetchone()
+
+         if result:
+             return RowCountInfo(
+                 source_name=result[0],
+                 table_name=result[1],
+                 row_count=result[2],
+                 last_refreshed=result[3]
+             )
+         return None
+
+     def get_all_row_counts(self) -> List[RowCountInfo]:
+         """
+         Get all cached row counts.
+
+         Returns:
+             List of RowCountInfo objects
+         """
+         results = self.conn.execute("""
+             SELECT source_name, table_name, row_count, last_refreshed
+             FROM row_counts
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [
+             RowCountInfo(
+                 source_name=r[0],
+                 table_name=r[1],
+                 row_count=r[2],
+                 last_refreshed=r[3]
+             )
+             for r in results
+         ]
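
What `dvt snap` does for counts, in miniature (names and numbers hypothetical):

    store.save_row_count("crm", "users", 42_000)
    info = store.get_row_count("crm", "users")
    if info is not None:
        print(info.row_count, info.last_refreshed)
    for rc in store.get_all_row_counts():
        print(rc.source_name, rc.table_name, rc.row_count)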
+
+     def clear_row_counts(self) -> None:
+         """Clear all row count data."""
+         self.conn.execute("DELETE FROM row_counts")
+
+     def clear_snapshot(self) -> None:
+         """Clear all snapshot data (both column metadata and row counts)."""
+         self.clear_column_metadata()
+         self.clear_row_counts()
+
+     def clear_all_metadata(self) -> None:
+         """Clear ALL metadata from the store (columns, row counts, profiles)."""
+         self.clear_column_metadata()
+         self.clear_row_counts()
+         self.clear_profile_results()
+         # Note: catalog_nodes and lineage_edges are not cleared here
+         # as they're managed by dvt docs generate
+
+     def has_source_metadata(self) -> bool:
+         """
+         Check if there is any source metadata in the store.
+
+         Used to determine if this is the first run (auto-snapshot needed).
+
+         Returns:
+             True if source metadata exists, False otherwise
+         """
+         result = self.conn.execute("""
+             SELECT COUNT(*) FROM column_metadata
+             WHERE source_name NOT LIKE 'model:%'
+         """).fetchone()[0]
+         return result > 0
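
Per the docstring, this check gates first-run auto-snapshotting; note the NOT LIKE 'model:%' filter, so model-derived entries do not count as source metadata. A sketch of a caller-side check (the trigger logic is an assumption, not code from this file):

    store = ProjectMetadataStore(project_root)
    if not store.exists() or not store.has_source_metadata():
        print("no cached source metadata - run `dvt snap` before a federated run")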
+
+     def has_any_metadata(self) -> bool:
+         """
+         Check if there is any metadata (sources or models) in the store.
+
+         Returns:
+             True if any metadata exists, False otherwise
+         """
+         result = self.conn.execute(
+             "SELECT COUNT(*) FROM column_metadata"
+         ).fetchone()[0]
+         return result > 0
+
+     # =========================================================================
+     # Legacy Compatibility - save_table_metadata with row_count
+     # =========================================================================
+
+     def save_table_metadata_with_row_count(
+         self,
+         source_name: str,
+         table_name: str,
+         adapter_name: str,
+         connection_name: str,
+         schema_name: str,
+         columns: List[ColumnMetadata],
+         row_count: Optional[int],
+         last_refreshed: datetime
+     ) -> None:
+         """
+         Save both column metadata and row count (used by dvt snap).
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             adapter_name: Name of the adapter
+             connection_name: Name of the connection
+             schema_name: Schema name
+             columns: List of ColumnMetadata
+             row_count: Number of rows (or None)
+             last_refreshed: Timestamp
+         """
+         # Save column metadata
+         metadata = TableMetadata(
+             source_name=source_name,
+             table_name=table_name,
+             adapter_name=adapter_name,
+             connection_name=connection_name,
+             schema_name=schema_name,
+             columns=columns,
+             last_refreshed=last_refreshed
+         )
+         self.save_table_metadata(metadata)
+
+         # Save row count separately (only if provided)
+         if row_count is not None:
+             self.save_row_count(source_name, table_name, row_count, last_refreshed)
+
+     # =========================================================================
+     # Utility Methods
+     # =========================================================================
+
+     def exists(self) -> bool:
+         """Check if the metadata store exists."""
+         return self.db_path.exists()
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get statistics about the metadata store."""
+         # Count column metadata
+         tables_count = self.conn.execute(
+             "SELECT COUNT(DISTINCT source_name || '.' || table_name) FROM column_metadata"
+         ).fetchone()[0]
+
+         columns_count = self.conn.execute(
+             "SELECT COUNT(*) FROM column_metadata"
+         ).fetchone()[0]
+
+         # Count row counts
+         row_counts_count = self.conn.execute(
+             "SELECT COUNT(*) FROM row_counts"
+         ).fetchone()[0]
+
+         # Get registry stats
+         registry = self.registry
+         adapters = registry.get_supported_adapters()
+
+         return {
+             "metadata_tables": tables_count,
+             "metadata_columns": columns_count,
+             "row_counts_cached": row_counts_count,
+             "registry_adapters": len(adapters),
+             "supported_adapters": adapters,
+             "db_path": str(self.db_path),
+         }
+
+     def migrate_from_legacy(self) -> bool:
+         """
+         Migrate from legacy metadata.duckdb format to new format.
+
+         Returns:
+             True if migration was performed, False if not needed
+         """
+         legacy_path = self.dvt_dir / "metadata.duckdb"
+         if not legacy_path.exists():
+             return False
+
+         # Check if new store already exists
+         if self.db_path.exists():
+             return False
+
+         try:
+             # Connect to legacy database
+             legacy_conn = duckdb.connect(str(legacy_path), read_only=True)
+
+             # Check if metadata_snapshot table exists
+             result = legacy_conn.execute("""
+                 SELECT COUNT(*) FROM information_schema.tables
+                 WHERE table_name = 'metadata_snapshot'
+             """).fetchone()[0]
+
+             if result == 0:
+                 legacy_conn.close()
+                 return False
+
+             # Initialize new store
+             self.initialize()
+
+             # Migrate metadata_snapshot to column_metadata
+             rows = legacy_conn.execute("""
+                 SELECT DISTINCT
+                     source_name, table_name, column_name, adapter_name, connection_name,
+                     schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                     ordinal_position, last_refreshed
+                 FROM metadata_snapshot
+             """).fetchall()
+
+             for row in rows:
+                 self.conn.execute("""
+                     INSERT OR REPLACE INTO column_metadata
+                         (source_name, table_name, column_name, adapter_name, connection_name,
+                          schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                          ordinal_position, last_refreshed)
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                 """, list(row))
+
+             # Migrate row_count data (distinct per table)
+             row_count_rows = legacy_conn.execute("""
+                 SELECT DISTINCT source_name, table_name, row_count, MAX(last_refreshed)
+                 FROM metadata_snapshot
+                 WHERE row_count IS NOT NULL
+                 GROUP BY source_name, table_name, row_count
+             """).fetchall()
+
+             for row in row_count_rows:
+                 if row[2] is not None:  # row_count
+                     self.conn.execute("""
+                         INSERT OR REPLACE INTO row_counts
+                             (source_name, table_name, row_count, last_refreshed)
+                         VALUES (?, ?, ?, ?)
+                     """, list(row))
+
+             legacy_conn.close()
+             return True
+
+         except Exception as e:
+             print(f"[DVT] Warning: Migration failed: {e}")
+             return False
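
A sketch of where the migration might be invoked during store setup; the call site is an assumption, and the method itself already no-ops when there is nothing to migrate:

    store = ProjectMetadataStore(project_root)
    if store.migrate_from_legacy():
        print("migrated legacy .dvt/metadata.duckdb into metadata_store.duckdb")
    else:
        store.initialize()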
+
+     # =========================================================================
+     # Profile Results Operations (v0.56.0 - dvt profile command)
+     # =========================================================================
+
+     def save_profile_result(self, result: ColumnProfileResult) -> None:
+         """
+         Save a column profile result to the store.
+
+         Args:
+             result: ColumnProfileResult object
+         """
+         self.conn.execute("""
+             INSERT OR REPLACE INTO profile_results
+                 (source_name, table_name, column_name, profile_mode,
+                  row_count, null_count, null_percent, distinct_count, distinct_percent,
+                  min_value, max_value, mean_value, median_value, stddev_value,
+                  p25, p50, p75, min_length, max_length, avg_length,
+                  histogram, top_values, alerts, profiled_at, duration_ms)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         """, [
+             result.source_name, result.table_name, result.column_name, result.profile_mode,
+             result.row_count, result.null_count, result.null_percent,
+             result.distinct_count, result.distinct_percent,
+             result.min_value, result.max_value, result.mean_value,
+             result.median_value, result.stddev_value,
+             result.p25, result.p50, result.p75,
+             result.min_length, result.max_length, result.avg_length,
+             result.histogram, result.top_values, result.alerts,
+             result.profiled_at or datetime.now(), result.duration_ms
+         ])
+
+     def save_profile_results_batch(self, results: List[ColumnProfileResult]) -> None:
+         """
+         Save multiple profile results in a batch.
+
+         Args:
+             results: List of ColumnProfileResult objects
+         """
+         for result in results:
+             self.save_profile_result(result)
+
+     def get_profile_result(
+         self,
+         source_name: str,
+         table_name: str,
+         column_name: str,
+         profile_mode: str
+     ) -> Optional[ColumnProfileResult]:
+         """
+         Get a profile result for a specific column.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             column_name: Name of the column
+             profile_mode: Profile mode ('minimal', 'explorative', etc.)
+
+         Returns:
+             ColumnProfileResult or None if not found
+         """
+         result = self.conn.execute("""
+             SELECT source_name, table_name, column_name, profile_mode,
+                    row_count, null_count, null_percent, distinct_count, distinct_percent,
+                    min_value, max_value, mean_value, median_value, stddev_value,
+                    p25, p50, p75, min_length, max_length, avg_length,
+                    histogram, top_values, alerts, profiled_at, duration_ms
+             FROM profile_results
+             WHERE source_name = ? AND table_name = ? AND column_name = ? AND profile_mode = ?
+         """, [source_name, table_name, column_name, profile_mode]).fetchone()
+
+         if result:
+             return ColumnProfileResult(
+                 source_name=result[0], table_name=result[1],
+                 column_name=result[2], profile_mode=result[3],
+                 row_count=result[4], null_count=result[5], null_percent=result[6],
+                 distinct_count=result[7], distinct_percent=result[8],
+                 min_value=result[9], max_value=result[10], mean_value=result[11],
+                 median_value=result[12], stddev_value=result[13],
+                 p25=result[14], p50=result[15], p75=result[16],
+                 min_length=result[17], max_length=result[18], avg_length=result[19],
+                 histogram=result[20], top_values=result[21], alerts=result[22],
+                 profiled_at=result[23], duration_ms=result[24]
+             )
+         return None
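
A roundtrip through the profile store above; every metric value is invented for illustration:

    from datetime import datetime

    store.save_profile_result(ColumnProfileResult(
        source_name="crm", table_name="users", column_name="email",
        profile_mode="minimal",
        row_count=42_000, null_count=120, null_percent=0.29,
        distinct_count=41_500, distinct_percent=98.8,
        profiled_at=datetime.now(),
    ))
    fetched = store.get_profile_result("crm", "users", "email", "minimal")
    assert fetched is not None and fetched.null_count == 120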
+
+     def get_table_profile(
+         self,
+         source_name: str,
+         table_name: str,
+         profile_mode: Optional[str] = None
+     ) -> List[ColumnProfileResult]:
+         """
+         Get all profile results for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             profile_mode: Optional mode filter
+
+         Returns:
+             List of ColumnProfileResult objects
+         """
+         if profile_mode:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, profile_mode,
+                        row_count, null_count, null_percent, distinct_count, distinct_percent,
+                        min_value, max_value, mean_value, median_value, stddev_value,
+                        p25, p50, p75, min_length, max_length, avg_length,
+                        histogram, top_values, alerts, profiled_at, duration_ms
+                 FROM profile_results
+                 WHERE source_name = ? AND table_name = ? AND profile_mode = ?
+                 ORDER BY column_name
+             """, [source_name, table_name, profile_mode]).fetchall()
+         else:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, profile_mode,
+                        row_count, null_count, null_percent, distinct_count, distinct_percent,
+                        min_value, max_value, mean_value, median_value, stddev_value,
+                        p25, p50, p75, min_length, max_length, avg_length,
+                        histogram, top_values, alerts, profiled_at, duration_ms
+                 FROM profile_results
+                 WHERE source_name = ? AND table_name = ?
+                 ORDER BY column_name
+             """, [source_name, table_name]).fetchall()
+
+         return [
+             ColumnProfileResult(
+                 source_name=r[0], table_name=r[1], column_name=r[2], profile_mode=r[3],
+                 row_count=r[4], null_count=r[5], null_percent=r[6],
+                 distinct_count=r[7], distinct_percent=r[8],
+                 min_value=r[9], max_value=r[10], mean_value=r[11],
+                 median_value=r[12], stddev_value=r[13],
+                 p25=r[14], p50=r[15], p75=r[16],
+                 min_length=r[17], max_length=r[18], avg_length=r[19],
+                 histogram=r[20], top_values=r[21], alerts=r[22],
+                 profiled_at=r[23], duration_ms=r[24]
+             )
+             for r in results
+         ]
+
+     def get_all_profiled_tables(self) -> List[Tuple[str, str, str, datetime]]:
+         """
+         Get all profiled tables with their latest profile timestamp.
+
+         Returns:
+             List of (source_name, table_name, profile_mode, profiled_at) tuples
+         """
+         results = self.conn.execute("""
+             SELECT source_name, table_name, profile_mode, MAX(profiled_at) as last_profiled
+             FROM profile_results
+             GROUP BY source_name, table_name, profile_mode
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [(r[0], r[1], r[2], r[3]) for r in results]
+
+     def get_profile_alerts(self, source_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Get all profile alerts, optionally filtered by source.
+
+         Args:
+             source_name: Optional source filter
+
+         Returns:
+             List of alert dicts with source/table/column info
+         """
+         import json
+
+         if source_name:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, alerts
+                 FROM profile_results
+                 WHERE source_name = ? AND alerts IS NOT NULL
+             """, [source_name]).fetchall()
+         else:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, alerts
+                 FROM profile_results
+                 WHERE alerts IS NOT NULL
+             """).fetchall()
+
+         all_alerts = []
+         for r in results:
+             try:
+                 alerts = json.loads(r[3]) if r[3] else []
+                 for alert in alerts:
+                     alert["source_name"] = r[0]
+                     alert["table_name"] = r[1]
+                     alert["column_name"] = r[2]
+                     all_alerts.append(alert)
+             except json.JSONDecodeError:
+                 pass
+
+         return all_alerts
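
The alerts payload is a JSON string whose shape the ColumnProfileResult comment gives as [{type, severity, message}]; a sketch of writing one and reading it back through the flattening loop above (field values invented):

    import json

    result = store.get_profile_result("crm", "users", "email", "minimal")
    result.alerts = json.dumps([
        {"type": "high_null_rate", "severity": "warn",
         "message": "null rate above threshold"},
    ])
    store.save_profile_result(result)
    for alert in store.get_profile_alerts("crm"):
        # get_profile_alerts stamps source/table/column onto each dict
        print(alert["severity"], alert["column_name"], alert["message"])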
+
+     def clear_profile_results(self, source_name: Optional[str] = None) -> None:
+         """
+         Clear profile results, optionally for a specific source.
+
+         Args:
+             source_name: Optional source filter
+         """
+         if source_name:
+             self.conn.execute("DELETE FROM profile_results WHERE source_name = ?", [source_name])
+         else:
+             self.conn.execute("DELETE FROM profile_results")
+
+     # =========================================================================
+     # Catalog Node Operations (v0.56.0 - dvt docs generate enhancement)
+     # =========================================================================
+
+     def save_catalog_node(self, node: CatalogNode) -> None:
+         """
+         Save a catalog node to the store.
+
+         Args:
+             node: CatalogNode object
+         """
+         self.conn.execute("""
+             INSERT OR REPLACE INTO catalog_nodes
+                 (unique_id, resource_type, name, schema_name, database,
+                  connection_name, adapter_type, description, icon_type, color_hex,
+                  materialized, tags, meta, columns, row_count, bytes_stored,
+                  created_at, updated_at)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         """, [
+             node.unique_id, node.resource_type, node.name,
+             node.schema_name, node.database,
+             node.connection_name, node.adapter_type,
+             node.description, node.icon_type, node.color_hex,
+             node.materialized, node.tags, node.meta, node.columns,
+             node.row_count, node.bytes_stored,
+             node.created_at, node.updated_at or datetime.now()
+         ])
+
+     def save_catalog_nodes_batch(self, nodes: List[CatalogNode]) -> None:
+         """
+         Save multiple catalog nodes in a batch.
+
+         Args:
+             nodes: List of CatalogNode objects
+         """
+         for node in nodes:
+             self.save_catalog_node(node)
+
+     def get_catalog_node(self, unique_id: str) -> Optional[CatalogNode]:
+         """
+         Get a catalog node by unique ID.
+
+         Args:
+             unique_id: Unique node ID
+
+         Returns:
+             CatalogNode or None if not found
+         """
+         result = self.conn.execute("""
+             SELECT unique_id, resource_type, name, schema_name, database,
+                    connection_name, adapter_type, description, icon_type, color_hex,
+                    materialized, tags, meta, columns, row_count, bytes_stored,
+                    created_at, updated_at
+             FROM catalog_nodes
+             WHERE unique_id = ?
+         """, [unique_id]).fetchone()
+
+         if result:
+             return CatalogNode(
+                 unique_id=result[0], resource_type=result[1], name=result[2],
+                 schema_name=result[3], database=result[4],
+                 connection_name=result[5], adapter_type=result[6],
+                 description=result[7], icon_type=result[8], color_hex=result[9],
+                 materialized=result[10], tags=result[11], meta=result[12],
+                 columns=result[13], row_count=result[14], bytes_stored=result[15],
+                 created_at=result[16], updated_at=result[17]
+             )
+         return None
+
+     def get_catalog_nodes_by_type(self, resource_type: str) -> List[CatalogNode]:
+         """
+         Get all catalog nodes of a specific type.
+
+         Args:
+             resource_type: Type filter ('model', 'source', etc.)
+
+         Returns:
+             List of CatalogNode objects
+         """
+         results = self.conn.execute("""
+             SELECT unique_id, resource_type, name, schema_name, database,
+                    connection_name, adapter_type, description, icon_type, color_hex,
+                    materialized, tags, meta, columns, row_count, bytes_stored,
+                    created_at, updated_at
+             FROM catalog_nodes
+             WHERE resource_type = ?
+             ORDER BY name
+         """, [resource_type]).fetchall()
+
+         return [
+             CatalogNode(
+                 unique_id=r[0], resource_type=r[1], name=r[2],
+                 schema_name=r[3], database=r[4],
+                 connection_name=r[5], adapter_type=r[6],
+                 description=r[7], icon_type=r[8], color_hex=r[9],
+                 materialized=r[10], tags=r[11], meta=r[12],
+                 columns=r[13], row_count=r[14], bytes_stored=r[15],
+                 created_at=r[16], updated_at=r[17]
+             )
+             for r in results
+         ]
+
+     def get_all_catalog_nodes(self) -> List[CatalogNode]:
+         """
+         Get all catalog nodes.
+
+         Returns:
+             List of CatalogNode objects
+         """
+         results = self.conn.execute("""
+             SELECT unique_id, resource_type, name, schema_name, database,
+                    connection_name, adapter_type, description, icon_type, color_hex,
+                    materialized, tags, meta, columns, row_count, bytes_stored,
+                    created_at, updated_at
+             FROM catalog_nodes
+             ORDER BY resource_type, name
+         """).fetchall()
+
+         return [
+             CatalogNode(
+                 unique_id=r[0], resource_type=r[1], name=r[2],
+                 schema_name=r[3], database=r[4],
+                 connection_name=r[5], adapter_type=r[6],
+                 description=r[7], icon_type=r[8], color_hex=r[9],
+                 materialized=r[10], tags=r[11], meta=r[12],
+                 columns=r[13], row_count=r[14], bytes_stored=r[15],
+                 created_at=r[16], updated_at=r[17]
+             )
+             for r in results
+         ]
+
+     def search_catalog_nodes(self, query: str) -> List[CatalogNode]:
+         """
+         Search catalog nodes by name or description.
+
+         Args:
+             query: Search query string
+
+         Returns:
+             List of matching CatalogNode objects
+         """
+         search_pattern = f"%{query}%"
+         results = self.conn.execute("""
+             SELECT unique_id, resource_type, name, schema_name, database,
+                    connection_name, adapter_type, description, icon_type, color_hex,
+                    materialized, tags, meta, columns, row_count, bytes_stored,
+                    created_at, updated_at
+             FROM catalog_nodes
+             WHERE name ILIKE ? OR description ILIKE ? OR unique_id ILIKE ?
+             ORDER BY resource_type, name
+         """, [search_pattern, search_pattern, search_pattern]).fetchall()
+
+         return [
+             CatalogNode(
+                 unique_id=r[0], resource_type=r[1], name=r[2],
+                 schema_name=r[3], database=r[4],
+                 connection_name=r[5], adapter_type=r[6],
+                 description=r[7], icon_type=r[8], color_hex=r[9],
+                 materialized=r[10], tags=r[11], meta=r[12],
+                 columns=r[13], row_count=r[14], bytes_stored=r[15],
+                 created_at=r[16], updated_at=r[17]
+             )
+             for r in results
+         ]
1281
+
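For illustration, a hedged sketch composing the three read paths above; `store` is an assumed instance and the argument values are hypothetical:

    # Case-insensitive substring search: the query is wrapped in %...% and
    # matched with ILIKE against name, description, and unique_id.
    hits = store.search_catalog_nodes("orders")

    # One resource type, already sorted by name in the SQL.
    models = store.get_catalog_nodes_by_type("model")

    # Everything, ordered by (resource_type, name).
    all_nodes = store.get_all_catalog_nodes()

Because the raw query string is interpolated into the LIKE pattern, any `%` or `_` the caller passes act as wildcards rather than literal characters.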
+    def clear_catalog_nodes(self) -> None:
+        """Clear all catalog nodes."""
+        self.conn.execute("DELETE FROM catalog_nodes")
+
+    # =========================================================================
+    # Lineage Edge Operations (v0.56.0 - dvt docs generate enhancement)
+    # =========================================================================
+
+    def save_lineage_edge(self, edge: LineageEdge) -> int:
+        """
+        Save a lineage edge to the store.
+
+        Args:
+            edge: LineageEdge object
+
+        Returns:
+            ID of the inserted edge
+        """
+        if edge.id:
+            self.conn.execute("""
+                INSERT OR REPLACE INTO lineage_edges
+                (id, source_node_id, target_node_id, edge_type,
+                 is_cross_connection, source_connection, target_connection)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+            """, [
+                edge.id, edge.source_node_id, edge.target_node_id, edge.edge_type,
+                edge.is_cross_connection, edge.source_connection, edge.target_connection
+            ])
+            return edge.id
+        else:
+            result = self.conn.execute("""
+                INSERT INTO lineage_edges
+                (source_node_id, target_node_id, edge_type,
+                 is_cross_connection, source_connection, target_connection)
+                VALUES (?, ?, ?, ?, ?, ?)
+                RETURNING id
+            """, [
+                edge.source_node_id, edge.target_node_id, edge.edge_type,
+                edge.is_cross_connection, edge.source_connection, edge.target_connection
+            ]).fetchone()
+            return result[0]
+
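A sketch of both branches above, assuming `store` plus a `LineageEdge` constructor with the fields used here (the dataclass itself is defined outside this hunk); all identifiers are hypothetical:

    # No id yet: takes the INSERT ... RETURNING id branch, so the
    # database-assigned id comes back to the caller.
    edge = LineageEdge(
        id=None,
        source_node_id="model.my_project.stg_orders",
        target_node_id="model.my_project.fct_orders",
        edge_type="ref",
        is_cross_connection=False,
        source_connection="warehouse",
        target_connection="warehouse",
    )
    edge_id = store.save_lineage_edge(edge)

    # With an explicit id: takes the INSERT OR REPLACE (upsert) branch
    # and simply echoes the caller's id back.
    edge.id = edge_id
    assert store.save_lineage_edge(edge) == edge_id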
+    def save_lineage_edges_batch(self, edges: List[LineageEdge]) -> None:
+        """
+        Save multiple lineage edges in a batch.
+
+        Args:
+            edges: List of LineageEdge objects
+        """
+        for edge in edges:
+            self.save_lineage_edge(edge)
+
+    def get_lineage_edge(self, edge_id: int) -> Optional[LineageEdge]:
+        """
+        Get a lineage edge by ID.
+
+        Args:
+            edge_id: Edge ID
+
+        Returns:
+            LineageEdge or None if not found
+        """
+        result = self.conn.execute("""
+            SELECT id, source_node_id, target_node_id, edge_type,
+                   is_cross_connection, source_connection, target_connection
+            FROM lineage_edges
+            WHERE id = ?
+        """, [edge_id]).fetchone()
+
+        if result:
+            return LineageEdge(
+                id=result[0], source_node_id=result[1], target_node_id=result[2],
+                edge_type=result[3], is_cross_connection=result[4],
+                source_connection=result[5], target_connection=result[6]
+            )
+        return None
+
+    def get_upstream_edges(self, node_id: str) -> List[LineageEdge]:
+        """
+        Get all edges where this node is the target (upstream dependencies).
+
+        Args:
+            node_id: Node unique ID
+
+        Returns:
+            List of LineageEdge objects
+        """
+        results = self.conn.execute("""
+            SELECT id, source_node_id, target_node_id, edge_type,
+                   is_cross_connection, source_connection, target_connection
+            FROM lineage_edges
+            WHERE target_node_id = ?
+        """, [node_id]).fetchall()
+
+        return [
+            LineageEdge(
+                id=r[0], source_node_id=r[1], target_node_id=r[2],
+                edge_type=r[3], is_cross_connection=r[4],
+                source_connection=r[5], target_connection=r[6]
+            )
+            for r in results
+        ]
+
+    def get_downstream_edges(self, node_id: str) -> List[LineageEdge]:
+        """
+        Get all edges where this node is the source (downstream dependents).
+
+        Args:
+            node_id: Node unique ID
+
+        Returns:
+            List of LineageEdge objects
+        """
+        results = self.conn.execute("""
+            SELECT id, source_node_id, target_node_id, edge_type,
+                   is_cross_connection, source_connection, target_connection
+            FROM lineage_edges
+            WHERE source_node_id = ?
+        """, [node_id]).fetchall()
+
+        return [
+            LineageEdge(
+                id=r[0], source_node_id=r[1], target_node_id=r[2],
+                edge_type=r[3], is_cross_connection=r[4],
+                source_connection=r[5], target_connection=r[6]
+            )
+            for r in results
+        ]
+
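The two lookups above return direct neighbors only. As a minimal sketch, a transitive closure can be layered on top of `get_downstream_edges` without touching SQL (a recursive CTE over lineage_edges would be the in-database alternative); `store` is an assumed instance:

    def downstream_closure(store, node_id):
        # All transitive dependents of node_id via repeated edge lookups.
        # The `seen` set guards against cycles and duplicate visits.
        seen = set()
        frontier = [node_id]
        while frontier:
            for edge in store.get_downstream_edges(frontier.pop()):
                if edge.target_node_id not in seen:
                    seen.add(edge.target_node_id)
                    frontier.append(edge.target_node_id)
        return seen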
+    def get_all_lineage_edges(self) -> List[LineageEdge]:
+        """
+        Get all lineage edges.
+
+        Returns:
+            List of LineageEdge objects
+        """
+        results = self.conn.execute("""
+            SELECT id, source_node_id, target_node_id, edge_type,
+                   is_cross_connection, source_connection, target_connection
+            FROM lineage_edges
+            ORDER BY source_node_id, target_node_id
+        """).fetchall()
+
+        return [
+            LineageEdge(
+                id=r[0], source_node_id=r[1], target_node_id=r[2],
+                edge_type=r[3], is_cross_connection=r[4],
+                source_connection=r[5], target_connection=r[6]
+            )
+            for r in results
+        ]
+
+    def get_cross_connection_edges(self) -> List[LineageEdge]:
+        """
+        Get all edges that cross connection boundaries.
+
+        Returns:
+            List of cross-connection LineageEdge objects
+        """
+        results = self.conn.execute("""
+            SELECT id, source_node_id, target_node_id, edge_type,
+                   is_cross_connection, source_connection, target_connection
+            FROM lineage_edges
+            WHERE is_cross_connection = TRUE
+            ORDER BY source_node_id, target_node_id
+        """).fetchall()
+
+        return [
+            LineageEdge(
+                id=r[0], source_node_id=r[1], target_node_id=r[2],
+                edge_type=r[3], is_cross_connection=r[4],
+                source_connection=r[5], target_connection=r[6]
+            )
+            for r in results
+        ]
+
+    def get_lineage_graph(self) -> Dict[str, Any]:
+        """
+        Get the full lineage graph as a dict suitable for visualization.
+
+        Returns:
+            Dict with 'nodes' and 'edges' keys
+        """
+        nodes = self.get_all_catalog_nodes()
+        edges = self.get_all_lineage_edges()
+
+        return {
+            "nodes": [
+                {
+                    "id": n.unique_id,
+                    "type": n.resource_type,
+                    "name": n.name,
+                    "connection": n.connection_name,
+                    "adapter": n.adapter_type,
+                    "icon": n.icon_type,
+                    "color": n.color_hex,
+                }
+                for n in nodes
+            ],
+            "edges": [
+                {
+                    "source": e.source_node_id,
+                    "target": e.target_node_id,
+                    "type": e.edge_type,
+                    "cross_connection": e.is_cross_connection,
+                }
+                for e in edges
+            ],
+        }
+
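Since `get_lineage_graph` returns only plain dicts and lists, it serializes directly; a hedged sketch of handing the payload to a visualization front-end (the output path is hypothetical):

    import json

    graph = store.get_lineage_graph()
    with open("target/lineage.json", "w") as f:
        json.dump(graph, f, indent=2)

    # Edges carry a cross_connection flag, so a renderer can style
    # connection-crossing links differently.
    crossing = [e for e in graph["edges"] if e["cross_connection"]]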
+    def clear_lineage_edges(self) -> None:
+        """Clear all lineage edges."""
+        self.conn.execute("DELETE FROM lineage_edges")
+
+    def clear_catalog_and_lineage(self) -> None:
+        """Clear both catalog nodes and lineage edges."""
+        self.clear_lineage_edges()
+        self.clear_catalog_nodes()
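A sketch of a full refresh built on the clearing helpers, e.g. ahead of a docs regeneration; `store`, `nodes`, and `edges` are assumed to exist, and `save_catalog_node` is the presumed write counterpart defined earlier in this file (not shown in this hunk). Deleting edges before nodes would keep any foreign-key reference from lineage_edges to catalog_nodes satisfied throughout:

    store.clear_catalog_and_lineage()        # edges first, then nodes
    for node in nodes:
        store.save_catalog_node(node)        # assumed counterpart, defined earlier
    store.save_lineage_edges_batch(edges)    # shown above: per-edge save loop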