dvt_core-0.59.0a51-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
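
For readers who want to reproduce an inventory like the one above locally, here is a minimal sketch using only the Python standard library. It assumes the wheel has first been downloaded to the current directory (e.g. via pip, as shown in the comment) and that the file follows standard wheel naming; the line counts are approximate for binary members such as the bundled .duckdb files.

import zipfile

# Assumed to have been fetched first, e.g.:
#   pip download --no-deps dvt-core==0.59.0a51
WHEEL = "dvt_core-0.59.0a51-py3-none-any.whl"

with zipfile.ZipFile(WHEEL) as whl:
    for info in whl.infolist():
        # Count lines per member, similar to the "+N" column above.
        with whl.open(info) as f:
            lines = sum(1 for _ in f)
        print(f"{info.filename} +{lines}")
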
dbt/compute/metadata/store.py
@@ -0,0 +1,1020 @@
+ # =============================================================================
+ # DVT Project Metastore
+ # =============================================================================
+ # DuckDB-based metastore for DVT project RUNTIME data.
+ #
+ # RUNTIME/OPERATIONAL DATA (populated during execution):
+ #   - column_metadata: Column schema info from federated runs
+ #   - row_counts: Cached row counts from dvt snap
+ #   - profile_results: Data profiling from dvt profile
+ #
+ # NOTE: Catalog data (targets, sources, models, lineage) is now stored
+ # in a SEPARATE catalog.duckdb file. See CatalogStore class.
+ #
+ # Static registry data (type mappings, syntax rules, adapter queries) comes
+ # from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+ #
+ # Location: <project>/.dvt/metastore.duckdb
+ #
+ # DVT v0.54.0: Initial implementation
+ # DVT v0.55.0: Refactored to separate project metadata from shipped registry
+ # DVT v0.59.0: Separated into metastore.duckdb (runtime) and catalog.duckdb
+ #              (project catalog). Renamed from metadata_store.duckdb.
+ # =============================================================================
+
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+ from datetime import datetime
+
+ try:
+     import duckdb
+     HAS_DUCKDB = True
+ except ImportError:
+     HAS_DUCKDB = False
+
+ from dbt.compute.metadata.adapters_registry import (
+     AdaptersRegistry,
+     TypeMapping,
+     SyntaxRule,
+     get_registry,
+     get_spark_type as registry_get_spark_type,
+     get_syntax_rule as registry_get_syntax_rule,
+     get_metadata_query as registry_get_metadata_query,
+ )
+
+
+ @dataclass
+ class ColumnMetadata:
+     """Metadata for a single column."""
+     column_name: str
+     adapter_type: str
+     spark_type: str
+     is_nullable: bool
+     is_primary_key: bool
+     ordinal_position: int
+
+
+ @dataclass
+ class TableMetadata:
+     """Metadata for a table/view (columns only, no row count)."""
+     source_name: str
+     table_name: str
+     adapter_name: str
+     connection_name: str
+     schema_name: str
+     columns: List[ColumnMetadata]
+     last_refreshed: datetime
+
+
+ @dataclass
+ class RowCountInfo:
+     """Row count information for a table."""
+     source_name: str
+     table_name: str
+     row_count: int
+     last_refreshed: datetime
+
+
+ # =============================================================================
+ # Profile Results (v0.56.0 - dvt profile command)
+ # =============================================================================
+
+ @dataclass
+ class ColumnProfileResult:
+     """Profile result for a single column."""
+     source_name: str
+     table_name: str
+     column_name: str
+     profile_mode: str  # 'minimal', 'explorative', 'sensitive', 'time-series'
+
+     # Basic metrics (all modes)
+     row_count: Optional[int] = None
+     null_count: Optional[int] = None
+     null_percent: Optional[float] = None
+     distinct_count: Optional[int] = None
+     distinct_percent: Optional[float] = None
+
+     # Numeric metrics (explorative+)
+     min_value: Optional[float] = None
+     max_value: Optional[float] = None
+     mean_value: Optional[float] = None
+     median_value: Optional[float] = None
+     stddev_value: Optional[float] = None
+     p25: Optional[float] = None
+     p50: Optional[float] = None
+     p75: Optional[float] = None
+
+     # String metrics (explorative+)
+     min_length: Optional[int] = None
+     max_length: Optional[int] = None
+     avg_length: Optional[float] = None
+
+     # Distribution data (JSON strings)
+     histogram: Optional[str] = None   # JSON: bucket counts
+     top_values: Optional[str] = None  # JSON: top N values with counts
+
+     # Quality alerts (JSON string)
+     alerts: Optional[str] = None  # JSON: [{type, severity, message}]
+
+     # Metadata
+     profiled_at: Optional[datetime] = None
+     duration_ms: Optional[int] = None
+
+
+ # NOTE: CatalogNode, LineageEdge, TargetDefinition, SourceTableDefinition,
+ # ModelDefinition are now in catalog_store.py (v0.59.0 refactor)
+
+
+ class ProjectMetadataStore:
+     """
+     DuckDB-based metastore for a DVT project's runtime data.
+
+     Location: <project_root>/.dvt/metastore.duckdb
+
+     Tables (runtime/operational data):
+     - column_metadata: Schema info from federated runs
+     - row_counts: Cached row counts from dvt snap
+     - profile_results: Data profiling from dvt profile
+
+     NOTE: Catalog data (targets, sources, models, lineage) is stored in
+     a separate catalog.duckdb file. See CatalogStore class.
+
+     NOTE: Static registry data (type mappings, syntax rules, adapter queries)
+     comes from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+     """
+
+     DVT_DIR = ".dvt"
+     METADATA_DB = "metastore.duckdb"  # Renamed from metadata_store.duckdb in v0.59.0
+
+     def __init__(self, project_root: Path):
+         """
+         Initialize the metadata store.
+
+         Args:
+             project_root: Path to the DVT project root directory
+         """
+         if not HAS_DUCKDB:
+             raise ImportError(
+                 "DuckDB is required for metadata store. "
+                 "Install with: pip install duckdb"
+             )
+
+         self.project_root = Path(project_root)
+         self.dvt_dir = self.project_root / self.DVT_DIR
+         self.db_path = self.dvt_dir / self.METADATA_DB
+         self._conn: Optional[duckdb.DuckDBPyConnection] = None
+         self._registry: Optional[AdaptersRegistry] = None
+
+     @property
+     def conn(self) -> "duckdb.DuckDBPyConnection":
+         """Get or create database connection."""
+         if self._conn is None:
+             self._conn = duckdb.connect(str(self.db_path))
+         return self._conn
+
+     @property
+     def registry(self) -> AdaptersRegistry:
+         """Get the shipped adapters registry (singleton)."""
+         if self._registry is None:
+             self._registry = get_registry()
+         return self._registry
+
+     def close(self) -> None:
+         """Close the database connection."""
+         if self._conn is not None:
+             self._conn.close()
+             self._conn = None
+
+     def __enter__(self) -> "ProjectMetadataStore":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         self.close()
+
+     # =========================================================================
+     # Initialization
+     # =========================================================================
+
+     def initialize(self, drop_existing: bool = True) -> None:
+         """
+         Initialize the metadata store.
+
+         Creates:
+         1. .dvt/ directory if it doesn't exist
+         2. metastore.duckdb database
+         3. Schema tables (column_metadata, row_counts, profile_results)
+
+         Args:
+             drop_existing: If True, drops existing tables and recreates them
+                 with empty schemas. Default is True to ensure clean
+                 initialization on each `dvt init`.
+
+         NOTE: No registry data is loaded - that comes from the shipped DuckDB.
+         """
+         # Create .dvt/ directory
+         self.dvt_dir.mkdir(parents=True, exist_ok=True)
+
+         # Drop existing tables if requested (for clean init)
+         if drop_existing:
+             self._drop_all_tables()
+
+         # Create schema tables
+         self._create_schema()
+
+     def _drop_all_tables(self) -> None:
+         """Drop all metastore tables to reset to empty state."""
+         tables = [
+             "profile_results",
+             "row_counts",
+             "column_metadata",
+         ]
+         for table in tables:
+             self.conn.execute(f"DROP TABLE IF EXISTS {table}")
+
+     def _create_schema(self) -> None:
+         """Create the database schema tables."""
+
+         # Column metadata table (populated by dvt snap or federated runs)
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS column_metadata (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 column_name VARCHAR NOT NULL,
+                 adapter_name VARCHAR NOT NULL,
+                 connection_name VARCHAR NOT NULL,
+                 schema_name VARCHAR,
+                 adapter_type VARCHAR NOT NULL,
+                 spark_type VARCHAR NOT NULL,
+                 is_nullable BOOLEAN DEFAULT TRUE,
+                 is_primary_key BOOLEAN DEFAULT FALSE,
+                 ordinal_position INTEGER,
+                 last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 PRIMARY KEY(source_name, table_name, column_name)
+             )
+         """)
+
+         # Row counts table (ONLY populated by dvt snap, not during runs)
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS row_counts (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 row_count BIGINT,
+                 last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 PRIMARY KEY(source_name, table_name)
+             )
+         """)
+
+         # =====================================================================
+         # v0.56.0: Profile Results (dvt profile command)
+         # =====================================================================
+         self.conn.execute("""
+             CREATE TABLE IF NOT EXISTS profile_results (
+                 source_name VARCHAR NOT NULL,
+                 table_name VARCHAR NOT NULL,
+                 column_name VARCHAR NOT NULL,
+                 profile_mode VARCHAR NOT NULL,
+
+                 -- Basic metrics (all modes)
+                 row_count BIGINT,
+                 null_count BIGINT,
+                 null_percent DOUBLE,
+                 distinct_count BIGINT,
+                 distinct_percent DOUBLE,
+
+                 -- Numeric metrics (explorative+)
+                 min_value DOUBLE,
+                 max_value DOUBLE,
+                 mean_value DOUBLE,
+                 median_value DOUBLE,
+                 stddev_value DOUBLE,
+                 p25 DOUBLE,
+                 p50 DOUBLE,
+                 p75 DOUBLE,
+
+                 -- String metrics (explorative+)
+                 min_length INTEGER,
+                 max_length INTEGER,
+                 avg_length DOUBLE,
+
+                 -- Distribution data (JSON)
+                 histogram JSON,
+                 top_values JSON,
+
+                 -- Quality alerts
+                 alerts JSON,
+
+                 -- Metadata
+                 profiled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                 duration_ms INTEGER,
+
+                 PRIMARY KEY(source_name, table_name, column_name, profile_mode)
+             )
+         """)
+
+         # NOTE: Catalog tables (catalog_nodes, lineage_edges, targets,
+         # source_definitions, model_definitions) are now in catalog.duckdb
+         # See CatalogStore class (v0.59.0 refactor)
+
+         # Create indexes for fast lookups
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_column_metadata_source
+             ON column_metadata(source_name, table_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_column_metadata_adapter
+             ON column_metadata(adapter_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_row_counts_source
+             ON row_counts(source_name)
+         """)
+         self.conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_profile_results_table
+             ON profile_results(source_name, table_name)
+         """)
+
+     # =========================================================================
+     # Type Registry Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_spark_type(
+         self,
+         adapter_name: str,
+         adapter_type: str,
+         spark_version: str = "all"
+     ) -> Optional[str]:
+         """
+         Look up the Spark type for an adapter type.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter (e.g., 'postgres', 'snowflake')
+             adapter_type: Native adapter type (e.g., 'VARCHAR', 'INTEGER')
+             spark_version: Target Spark version (default: 'all')
+
+         Returns:
+             Spark type string or None if not found
+         """
+         mapping = self.registry.get_spark_type(adapter_name, adapter_type, spark_version)
+         return mapping.spark_type if mapping else None
+
+     def get_type_mappings(
+         self,
+         adapter_name: str,
+         spark_version: str = "all"
+     ) -> List[Tuple[str, str]]:
+         """
+         Get all type mappings for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Returns:
+             List of (adapter_type, spark_type) tuples
+         """
+         mappings = self.registry.get_all_mappings_for_adapter(adapter_name)
+         return [(m.adapter_type, m.spark_type) for m in mappings]
+
+     # =========================================================================
+     # Syntax Registry Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_syntax_rule(self, adapter_name: str) -> Optional[SyntaxRule]:
+         """
+         Get syntax rules for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter
+
+         Returns:
+             SyntaxRule or None if not found
+         """
+         return self.registry.get_syntax_rule(adapter_name)
+
+     def quote_identifier(self, adapter_name: str, identifier: str) -> str:
+         """Quote an identifier for the given adapter."""
+         return self.registry.quote_identifier(adapter_name, identifier)
+
+     # =========================================================================
+     # Adapter Metadata Queries (delegated to shipped AdaptersRegistry)
+     # =========================================================================
+
+     def get_metadata_query(
+         self,
+         adapter_name: str,
+         query_type: str
+     ) -> Optional[str]:
+         """
+         Get the metadata query template for an adapter.
+
+         Delegates to the shipped AdaptersRegistry.
+
+         Args:
+             adapter_name: Name of the adapter
+             query_type: Type of query ('columns', 'tables', 'row_count', 'primary_key')
+
+         Returns:
+             Query template string or None if not found
+         """
+         query = self.registry.get_metadata_query(adapter_name, query_type)
+         return query.query_template if query else None
+
+     # =========================================================================
+     # Column Metadata Operations
+     # =========================================================================
+
+     def save_table_metadata(self, metadata: TableMetadata) -> None:
+         """
+         Save table column metadata to the store.
+
+         This is called during federated execution to capture schema info.
+
+         Args:
+             metadata: TableMetadata object with column info
+         """
+         # Delete existing entries for this table
+         self.conn.execute("""
+             DELETE FROM column_metadata
+             WHERE source_name = ? AND table_name = ?
+         """, [metadata.source_name, metadata.table_name])
+
+         # Insert new entries
+         for col in metadata.columns:
+             self.conn.execute("""
+                 INSERT INTO column_metadata
+                 (source_name, table_name, column_name, adapter_name, connection_name,
+                  schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                  ordinal_position, last_refreshed)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, [
+                 metadata.source_name,
+                 metadata.table_name,
+                 col.column_name,
+                 metadata.adapter_name,
+                 metadata.connection_name,
+                 metadata.schema_name,
+                 col.adapter_type,
+                 col.spark_type,
+                 col.is_nullable,
+                 col.is_primary_key,
+                 col.ordinal_position,
+                 metadata.last_refreshed
+             ])
+
+     def get_table_metadata(
+         self,
+         source_name: str,
+         table_name: str
+     ) -> Optional[TableMetadata]:
+         """
+         Get cached column metadata for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+
+         Returns:
+             TableMetadata or None if not cached
+         """
+         results = self.conn.execute("""
+             SELECT
+                 source_name, table_name, column_name, adapter_name, connection_name,
+                 schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                 ordinal_position, last_refreshed
+             FROM column_metadata
+             WHERE source_name = ? AND table_name = ?
+             ORDER BY ordinal_position
+         """, [source_name, table_name]).fetchall()
+
+         if not results:
+             return None
+
+         # Build column list
+         columns = []
+         for r in results:
+             columns.append(ColumnMetadata(
+                 column_name=r[2],
+                 adapter_type=r[6],
+                 spark_type=r[7],
+                 is_nullable=r[8],
+                 is_primary_key=r[9],
+                 ordinal_position=r[10]
+             ))
+
+         # Build TableMetadata from first row
+         first = results[0]
+         return TableMetadata(
+             source_name=first[0],
+             table_name=first[1],
+             adapter_name=first[3],
+             connection_name=first[4],
+             schema_name=first[5],
+             columns=columns,
+             last_refreshed=first[11]
+         )
+
+     def get_all_sources(self) -> List[Tuple[str, str]]:
+         """
+         Get all source/table combinations in the store.
+
+         Returns:
+             List of (source_name, table_name) tuples
+         """
+         results = self.conn.execute("""
+             SELECT DISTINCT source_name, table_name
+             FROM column_metadata
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [(r[0], r[1]) for r in results]
+
+     def clear_column_metadata(self) -> None:
+         """Clear all column metadata."""
+         self.conn.execute("DELETE FROM column_metadata")
+
+     # =========================================================================
+     # Row Count Operations (dvt snap only)
+     # =========================================================================
+
+     def save_row_count(
+         self,
+         source_name: str,
+         table_name: str,
+         row_count: int,
+         last_refreshed: Optional[datetime] = None
+     ) -> None:
+         """
+         Save row count for a table.
+
+         This is ONLY called by dvt snap, not during regular runs.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             row_count: Number of rows
+             last_refreshed: Timestamp (defaults to now)
+         """
+         if last_refreshed is None:
+             last_refreshed = datetime.now()
+
+         self.conn.execute("""
+             INSERT OR REPLACE INTO row_counts
+             (source_name, table_name, row_count, last_refreshed)
+             VALUES (?, ?, ?, ?)
+         """, [source_name, table_name, row_count, last_refreshed])
+
+     def get_row_count(self, source_name: str, table_name: str) -> Optional[RowCountInfo]:
+         """
+         Get cached row count for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+
+         Returns:
+             RowCountInfo or None if not cached
+         """
+         result = self.conn.execute("""
+             SELECT source_name, table_name, row_count, last_refreshed
+             FROM row_counts
+             WHERE source_name = ? AND table_name = ?
+         """, [source_name, table_name]).fetchone()
+
+         if result:
+             return RowCountInfo(
+                 source_name=result[0],
+                 table_name=result[1],
+                 row_count=result[2],
+                 last_refreshed=result[3]
+             )
+         return None
+
+     def get_all_row_counts(self) -> List[RowCountInfo]:
+         """
+         Get all cached row counts.
+
+         Returns:
+             List of RowCountInfo objects
+         """
+         results = self.conn.execute("""
+             SELECT source_name, table_name, row_count, last_refreshed
+             FROM row_counts
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [
+             RowCountInfo(
+                 source_name=r[0],
+                 table_name=r[1],
+                 row_count=r[2],
+                 last_refreshed=r[3]
+             )
+             for r in results
+         ]
+
+     def clear_row_counts(self) -> None:
+         """Clear all row count data."""
+         self.conn.execute("DELETE FROM row_counts")
+
+     def clear_snapshot(self) -> None:
+         """Clear all snapshot data (both column metadata and row counts)."""
+         self.clear_column_metadata()
+         self.clear_row_counts()
+
+     def clear_all_metadata(self) -> None:
+         """Clear ALL metadata from the store (columns, row counts, profiles)."""
+         self.clear_column_metadata()
+         self.clear_row_counts()
+         self.clear_profile_results()
+
+     def has_source_metadata(self) -> bool:
+         """
+         Check if there is any source metadata in the store.
+
+         Used to determine if this is the first run (auto-snapshot needed).
+
+         Returns:
+             True if source metadata exists, False otherwise
+         """
+         result = self.conn.execute("""
+             SELECT COUNT(*) FROM column_metadata
+             WHERE source_name NOT LIKE 'model:%'
+         """).fetchone()[0]
+         return result > 0
+
+     def has_any_metadata(self) -> bool:
+         """
+         Check if there is any metadata (sources or models) in the store.
+
+         Returns:
+             True if any metadata exists, False otherwise
+         """
+         result = self.conn.execute(
+             "SELECT COUNT(*) FROM column_metadata"
+         ).fetchone()[0]
+         return result > 0
+
+     # =========================================================================
+     # Legacy Compatibility - save_table_metadata with row_count
+     # =========================================================================
+
+     def save_table_metadata_with_row_count(
+         self,
+         source_name: str,
+         table_name: str,
+         adapter_name: str,
+         connection_name: str,
+         schema_name: str,
+         columns: List[ColumnMetadata],
+         row_count: Optional[int],
+         last_refreshed: datetime
+     ) -> None:
+         """
+         Save both column metadata and row count (used by dvt snap).
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             adapter_name: Name of the adapter
+             connection_name: Name of the connection
+             schema_name: Schema name
+             columns: List of ColumnMetadata
+             row_count: Number of rows (or None)
+             last_refreshed: Timestamp
+         """
+         # Save column metadata
+         metadata = TableMetadata(
+             source_name=source_name,
+             table_name=table_name,
+             adapter_name=adapter_name,
+             connection_name=connection_name,
+             schema_name=schema_name,
+             columns=columns,
+             last_refreshed=last_refreshed
+         )
+         self.save_table_metadata(metadata)
+
+         # Save row count separately (only if provided)
+         if row_count is not None:
+             self.save_row_count(source_name, table_name, row_count, last_refreshed)
+
+     # =========================================================================
+     # Utility Methods
+     # =========================================================================
+
+     def exists(self) -> bool:
+         """Check if the metadata store exists."""
+         return self.db_path.exists()
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get statistics about the metadata store."""
+         # Count column metadata
+         tables_count = self.conn.execute(
+             "SELECT COUNT(DISTINCT source_name || '.' || table_name) FROM column_metadata"
+         ).fetchone()[0]
+
+         columns_count = self.conn.execute(
+             "SELECT COUNT(*) FROM column_metadata"
+         ).fetchone()[0]
+
+         # Count row counts
+         row_counts_count = self.conn.execute(
+             "SELECT COUNT(*) FROM row_counts"
+         ).fetchone()[0]
+
+         # Get registry stats
+         registry = self.registry
+         adapters = registry.get_supported_adapters()
+
+         return {
+             "metadata_tables": tables_count,
+             "metadata_columns": columns_count,
+             "row_counts_cached": row_counts_count,
+             "registry_adapters": len(adapters),
+             "supported_adapters": adapters,
+             "db_path": str(self.db_path),
+         }
+
+     def migrate_from_legacy(self) -> bool:
+         """
+         Migrate from legacy metadata.duckdb format to new format.
+
+         Returns:
+             True if migration was performed, False if not needed
+         """
+         legacy_path = self.dvt_dir / "metadata.duckdb"
+         if not legacy_path.exists():
+             return False
+
+         # Check if new store already exists
+         if self.db_path.exists():
+             return False
+
+         try:
+             # Connect to legacy database
+             legacy_conn = duckdb.connect(str(legacy_path), read_only=True)
+
+             # Check if metadata_snapshot table exists
+             result = legacy_conn.execute("""
+                 SELECT COUNT(*) FROM information_schema.tables
+                 WHERE table_name = 'metadata_snapshot'
+             """).fetchone()[0]
+
+             if result == 0:
+                 legacy_conn.close()
+                 return False
+
+             # Initialize new store
+             self.initialize()
+
+             # Migrate metadata_snapshot to column_metadata
+             rows = legacy_conn.execute("""
+                 SELECT DISTINCT
+                     source_name, table_name, column_name, adapter_name, connection_name,
+                     schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                     ordinal_position, last_refreshed
+                 FROM metadata_snapshot
+             """).fetchall()
+
+             for row in rows:
+                 self.conn.execute("""
+                     INSERT OR REPLACE INTO column_metadata
+                     (source_name, table_name, column_name, adapter_name, connection_name,
+                      schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                      ordinal_position, last_refreshed)
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                 """, list(row))
+
+             # Migrate row_count data (distinct per table)
+             row_count_rows = legacy_conn.execute("""
+                 SELECT DISTINCT source_name, table_name, row_count, MAX(last_refreshed)
+                 FROM metadata_snapshot
+                 WHERE row_count IS NOT NULL
+                 GROUP BY source_name, table_name, row_count
+             """).fetchall()
+
+             for row in row_count_rows:
+                 if row[2] is not None:  # row_count
+                     self.conn.execute("""
+                         INSERT OR REPLACE INTO row_counts
+                         (source_name, table_name, row_count, last_refreshed)
+                         VALUES (?, ?, ?, ?)
+                     """, list(row))
+
+             legacy_conn.close()
+             return True
+
+         except Exception as e:
+             print(f"[DVT] Warning: Migration failed: {e}")
+             return False
+
+     # =========================================================================
+     # Profile Results Operations (v0.56.0 - dvt profile command)
+     # =========================================================================
+
+     def save_profile_result(self, result: ColumnProfileResult) -> None:
+         """
+         Save a column profile result to the store.
+
+         Args:
+             result: ColumnProfileResult object
+         """
+         self.conn.execute("""
+             INSERT OR REPLACE INTO profile_results
+             (source_name, table_name, column_name, profile_mode,
+              row_count, null_count, null_percent, distinct_count, distinct_percent,
+              min_value, max_value, mean_value, median_value, stddev_value,
+              p25, p50, p75, min_length, max_length, avg_length,
+              histogram, top_values, alerts, profiled_at, duration_ms)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         """, [
+             result.source_name, result.table_name, result.column_name, result.profile_mode,
+             result.row_count, result.null_count, result.null_percent,
+             result.distinct_count, result.distinct_percent,
+             result.min_value, result.max_value, result.mean_value,
+             result.median_value, result.stddev_value,
+             result.p25, result.p50, result.p75,
+             result.min_length, result.max_length, result.avg_length,
+             result.histogram, result.top_values, result.alerts,
+             result.profiled_at or datetime.now(), result.duration_ms
+         ])
+
+     def save_profile_results_batch(self, results: List[ColumnProfileResult]) -> None:
+         """
+         Save multiple profile results in a batch.
+
+         Args:
+             results: List of ColumnProfileResult objects
+         """
+         for result in results:
+             self.save_profile_result(result)
+
+     def get_profile_result(
+         self,
+         source_name: str,
+         table_name: str,
+         column_name: str,
+         profile_mode: str
+     ) -> Optional[ColumnProfileResult]:
+         """
+         Get a profile result for a specific column.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             column_name: Name of the column
+             profile_mode: Profile mode ('minimal', 'explorative', etc.)
+
+         Returns:
+             ColumnProfileResult or None if not found
+         """
+         result = self.conn.execute("""
+             SELECT source_name, table_name, column_name, profile_mode,
+                    row_count, null_count, null_percent, distinct_count, distinct_percent,
+                    min_value, max_value, mean_value, median_value, stddev_value,
+                    p25, p50, p75, min_length, max_length, avg_length,
+                    histogram, top_values, alerts, profiled_at, duration_ms
+             FROM profile_results
+             WHERE source_name = ? AND table_name = ? AND column_name = ? AND profile_mode = ?
+         """, [source_name, table_name, column_name, profile_mode]).fetchone()
+
+         if result:
+             return ColumnProfileResult(
+                 source_name=result[0], table_name=result[1],
+                 column_name=result[2], profile_mode=result[3],
+                 row_count=result[4], null_count=result[5], null_percent=result[6],
+                 distinct_count=result[7], distinct_percent=result[8],
+                 min_value=result[9], max_value=result[10], mean_value=result[11],
+                 median_value=result[12], stddev_value=result[13],
+                 p25=result[14], p50=result[15], p75=result[16],
+                 min_length=result[17], max_length=result[18], avg_length=result[19],
+                 histogram=result[20], top_values=result[21], alerts=result[22],
+                 profiled_at=result[23], duration_ms=result[24]
+             )
+         return None
+
+     def get_table_profile(
+         self,
+         source_name: str,
+         table_name: str,
+         profile_mode: Optional[str] = None
+     ) -> List[ColumnProfileResult]:
+         """
+         Get all profile results for a table.
+
+         Args:
+             source_name: Name of the source
+             table_name: Name of the table
+             profile_mode: Optional mode filter
+
+         Returns:
+             List of ColumnProfileResult objects
+         """
+         if profile_mode:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, profile_mode,
+                        row_count, null_count, null_percent, distinct_count, distinct_percent,
+                        min_value, max_value, mean_value, median_value, stddev_value,
+                        p25, p50, p75, min_length, max_length, avg_length,
+                        histogram, top_values, alerts, profiled_at, duration_ms
+                 FROM profile_results
+                 WHERE source_name = ? AND table_name = ? AND profile_mode = ?
+                 ORDER BY column_name
+             """, [source_name, table_name, profile_mode]).fetchall()
+         else:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, profile_mode,
+                        row_count, null_count, null_percent, distinct_count, distinct_percent,
+                        min_value, max_value, mean_value, median_value, stddev_value,
+                        p25, p50, p75, min_length, max_length, avg_length,
+                        histogram, top_values, alerts, profiled_at, duration_ms
+                 FROM profile_results
+                 WHERE source_name = ? AND table_name = ?
+                 ORDER BY column_name
+             """, [source_name, table_name]).fetchall()
+
+         return [
+             ColumnProfileResult(
+                 source_name=r[0], table_name=r[1], column_name=r[2], profile_mode=r[3],
+                 row_count=r[4], null_count=r[5], null_percent=r[6],
+                 distinct_count=r[7], distinct_percent=r[8],
+                 min_value=r[9], max_value=r[10], mean_value=r[11],
+                 median_value=r[12], stddev_value=r[13],
+                 p25=r[14], p50=r[15], p75=r[16],
+                 min_length=r[17], max_length=r[18], avg_length=r[19],
+                 histogram=r[20], top_values=r[21], alerts=r[22],
+                 profiled_at=r[23], duration_ms=r[24]
+             )
+             for r in results
+         ]
+
+     def get_all_profiled_tables(self) -> List[Tuple[str, str, str, datetime]]:
+         """
+         Get all profiled tables with their latest profile timestamp.
+
+         Returns:
+             List of (source_name, table_name, profile_mode, profiled_at) tuples
+         """
+         results = self.conn.execute("""
+             SELECT source_name, table_name, profile_mode, MAX(profiled_at) as last_profiled
+             FROM profile_results
+             GROUP BY source_name, table_name, profile_mode
+             ORDER BY source_name, table_name
+         """).fetchall()
+
+         return [(r[0], r[1], r[2], r[3]) for r in results]
+
+     def get_profile_alerts(self, source_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Get all profile alerts, optionally filtered by source.
+
+         Args:
+             source_name: Optional source filter
+
+         Returns:
+             List of alert dicts with source/table/column info
+         """
+         import json
+
+         if source_name:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, alerts
+                 FROM profile_results
+                 WHERE source_name = ? AND alerts IS NOT NULL
+             """, [source_name]).fetchall()
+         else:
+             results = self.conn.execute("""
+                 SELECT source_name, table_name, column_name, alerts
+                 FROM profile_results
+                 WHERE alerts IS NOT NULL
+             """).fetchall()
+
+         all_alerts = []
+         for r in results:
+             try:
+                 alerts = json.loads(r[3]) if r[3] else []
+                 for alert in alerts:
+                     alert["source_name"] = r[0]
+                     alert["table_name"] = r[1]
+                     alert["column_name"] = r[2]
+                     all_alerts.append(alert)
+             except json.JSONDecodeError:
+                 pass
+
+         return all_alerts
+
+     def clear_profile_results(self, source_name: Optional[str] = None) -> None:
+         """
+         Clear profile results, optionally for a specific source.
+
+         Args:
+             source_name: Optional source filter
+         """
+         if source_name:
+             self.conn.execute("DELETE FROM profile_results WHERE source_name = ?", [source_name])
+         else:
+             self.conn.execute("DELETE FROM profile_results")
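
The store added in this diff is self-contained, so a short usage sketch may help readers follow the API. The snippet below is illustrative only and is not part of the package: it uses only the ProjectMetadataStore, TableMetadata, and ColumnMetadata definitions shown above, assumes duckdb is installed, and the project path and all data values are hypothetical.

# Illustrative sketch of the ProjectMetadataStore API shown in the diff above.
# Not shipped in the package; assumes `pip install duckdb` and a hypothetical
# project directory.
from datetime import datetime
from pathlib import Path

from dbt.compute.metadata.store import (
    ColumnMetadata,
    ProjectMetadataStore,
    TableMetadata,
)

project_root = Path("/tmp/my_dvt_project")  # hypothetical path

with ProjectMetadataStore(project_root) as store:
    # Creates .dvt/metastore.duckdb with empty runtime tables
    # (drop_existing=True by default, mirroring `dvt init`).
    store.initialize()

    # Cache schema info the way a federated run or `dvt snap` would.
    store.save_table_metadata(TableMetadata(
        source_name="crm",
        table_name="customers",
        adapter_name="postgres",
        connection_name="crm_prod",
        schema_name="public",
        columns=[
            # column_name, adapter_type, spark_type, nullable, pk, position
            ColumnMetadata("id", "INTEGER", "IntegerType", False, True, 1),
            ColumnMetadata("email", "VARCHAR", "StringType", True, False, 2),
        ],
        last_refreshed=datetime.now(),
    ))
    store.save_row_count("crm", "customers", 42_000)  # dvt snap only

    cached = store.get_table_metadata("crm", "customers")
    print([c.column_name for c in cached.columns])  # ['id', 'email']
    print(store.get_stats())  # also reports the shipped registry's adapters

Note the division of labor the module header describes: this store holds only runtime data (column metadata, row counts, profile results), while catalog data lives in catalog.duckdb via CatalogStore and static type/syntax rules come read-only from the shipped adapters_registry.duckdb.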