dvt-core 0.58.6 (cp311-cp311-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/compute/metadata/store.py
@@ -0,0 +1,1499 @@
+# =============================================================================
+# DVT Project Metadata Store
+# =============================================================================
+# DuckDB-based metadata store for DVT projects.
+#
+# This store contains PROJECT-LEVEL data only:
+# - Column metadata (from dvt snap or federated runs)
+# - Row counts (from dvt snap only, NOT during every run)
+#
+# Static registry data (type mappings, syntax rules, adapter queries) comes
+# from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+#
+# Location: <project>/.dvt/metadata_store.duckdb
+#
+# DVT v0.54.0: Initial implementation
+# DVT v0.55.0: Refactored to separate project metadata from shipped registry
+# =============================================================================
+
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+
+try:
+    import duckdb
+    HAS_DUCKDB = True
+except ImportError:
+    HAS_DUCKDB = False
+
+from dbt.compute.metadata.adapters_registry import (
+    AdaptersRegistry,
+    TypeMapping,
+    SyntaxRule,
+    get_registry,
+    get_spark_type as registry_get_spark_type,
+    get_syntax_rule as registry_get_syntax_rule,
+    get_metadata_query as registry_get_metadata_query,
+)
+
+
+@dataclass
+class ColumnMetadata:
+    """Metadata for a single column."""
+    column_name: str
+    adapter_type: str
+    spark_type: str
+    is_nullable: bool
+    is_primary_key: bool
+    ordinal_position: int
+
+
+@dataclass
+class TableMetadata:
+    """Metadata for a table/view (columns only, no row count)."""
+    source_name: str
+    table_name: str
+    adapter_name: str
+    connection_name: str
+    schema_name: str
+    columns: List[ColumnMetadata]
+    last_refreshed: datetime
+
+
+@dataclass
+class RowCountInfo:
+    """Row count information for a table."""
+    source_name: str
+    table_name: str
+    row_count: int
+    last_refreshed: datetime
+
+
+# =============================================================================
+# Profile Results (v0.56.0 - dvt profile command)
+# =============================================================================
+
+@dataclass
+class ColumnProfileResult:
+    """Profile result for a single column."""
+    source_name: str
+    table_name: str
+    column_name: str
+    profile_mode: str  # 'minimal', 'explorative', 'sensitive', 'time-series'
+
+    # Basic metrics (all modes)
+    row_count: Optional[int] = None
+    null_count: Optional[int] = None
+    null_percent: Optional[float] = None
+    distinct_count: Optional[int] = None
+    distinct_percent: Optional[float] = None
+
+    # Numeric metrics (explorative+)
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+    mean_value: Optional[float] = None
+    median_value: Optional[float] = None
+    stddev_value: Optional[float] = None
+    p25: Optional[float] = None
+    p50: Optional[float] = None
+    p75: Optional[float] = None
+
+    # String metrics (explorative+)
+    min_length: Optional[int] = None
+    max_length: Optional[int] = None
+    avg_length: Optional[float] = None
+
+    # Distribution data (JSON strings)
+    histogram: Optional[str] = None  # JSON: bucket counts
+    top_values: Optional[str] = None  # JSON: top N values with counts
+
+    # Quality alerts (JSON string)
+    alerts: Optional[str] = None  # JSON: [{type, severity, message}]
+
+    # Metadata
+    profiled_at: Optional[datetime] = None
+    duration_ms: Optional[int] = None
+
+
+# =============================================================================
+# Catalog Nodes (v0.56.0 - dvt docs generate enhancement)
+# =============================================================================
+
+@dataclass
+class CatalogNode:
+    """Enriched catalog node for dvt docs generate."""
+    unique_id: str
+    resource_type: str  # 'model', 'source', 'test', 'seed', 'snapshot'
+    name: str
+    schema_name: Optional[str] = None
+    database: Optional[str] = None
+
+    # Connection info
+    connection_name: Optional[str] = None
+    adapter_type: Optional[str] = None
+
+    # Documentation
+    description: Optional[str] = None
+
+    # Visual enrichment
+    icon_type: Optional[str] = None  # 'postgres', 'snowflake', 'spark', etc.
+    color_hex: Optional[str] = None  # Connection color
+
+    # Config
+    materialized: Optional[str] = None
+    tags: Optional[str] = None  # JSON array
+    meta: Optional[str] = None  # JSON object
+
+    # Columns (JSON array)
+    columns: Optional[str] = None
+
+    # Statistics
+    row_count: Optional[int] = None
+    bytes_stored: Optional[int] = None
+
+    # Timestamps
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+
+# =============================================================================
+# Lineage Edges (v0.56.0 - dvt docs generate enhancement)
+# =============================================================================
+
+@dataclass
+class LineageEdge:
+    """Lineage edge representing a dependency between nodes."""
+    id: Optional[int] = None
+    source_node_id: str = ""
+    target_node_id: str = ""
+    edge_type: str = ""  # 'ref', 'source', 'depends_on'
+
+    # Cross-connection indicator
+    is_cross_connection: bool = False
+    source_connection: Optional[str] = None
+    target_connection: Optional[str] = None
+
+
+class ProjectMetadataStore:
+    """
+    DuckDB-based metadata store for a DVT project.
+
+    Location: <project_root>/.dvt/metadata_store.duckdb
+
+    Tables (project-level data only):
+    - column_metadata: source_name, table_name, column_name, adapter_type, spark_type, ...
+    - row_counts: source_name, table_name, row_count, last_refreshed
+
+    NOTE: Static registry data (type mappings, syntax rules, adapter queries)
+    comes from the shipped adapters_registry.duckdb via AdaptersRegistry class.
+    """
+
+    DVT_DIR = ".dvt"
+    METADATA_DB = "metadata_store.duckdb"
+
+    def __init__(self, project_root: Path):
+        """
+        Initialize the metadata store.
+
+        Args:
+            project_root: Path to the DVT project root directory
+        """
+        if not HAS_DUCKDB:
+            raise ImportError(
+                "DuckDB is required for metadata store. "
+                "Install with: pip install duckdb"
+            )
+
+        self.project_root = Path(project_root)
+        self.dvt_dir = self.project_root / self.DVT_DIR
+        self.db_path = self.dvt_dir / self.METADATA_DB
+        self._conn: Optional[duckdb.DuckDBPyConnection] = None
+        self._registry: Optional[AdaptersRegistry] = None
+
+    @property
+    def conn(self) -> "duckdb.DuckDBPyConnection":
+        """Get or create database connection."""
+        if self._conn is None:
+            self._conn = duckdb.connect(str(self.db_path))
+        return self._conn
+
+    @property
+    def registry(self) -> AdaptersRegistry:
+        """Get the shipped adapters registry (singleton)."""
+        if self._registry is None:
+            self._registry = get_registry()
+        return self._registry
+
+    def close(self) -> None:
+        """Close the database connection."""
+        if self._conn is not None:
+            self._conn.close()
+            self._conn = None
+
+    def __enter__(self) -> "ProjectMetadataStore":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
+
+    # =========================================================================
+    # Initialization
+    # =========================================================================
+
+    def initialize(self) -> None:
+        """
+        Initialize the metadata store.
+
+        Creates:
+        1. .dvt/ directory if it doesn't exist
+        2. metadata_store.duckdb database
+        3. Schema tables (column_metadata, row_counts)
+
+        NOTE: No registry data is loaded - that comes from shipped DuckDB.
+        """
+        # Create .dvt/ directory
+        self.dvt_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create schema tables
+        self._create_schema()
+
+    def _create_schema(self) -> None:
+        """Create the database schema tables."""
+
+        # Column metadata table (populated by dvt snap or federated runs)
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS column_metadata (
+                source_name VARCHAR NOT NULL,
+                table_name VARCHAR NOT NULL,
+                column_name VARCHAR NOT NULL,
+                adapter_name VARCHAR NOT NULL,
+                connection_name VARCHAR NOT NULL,
+                schema_name VARCHAR,
+                adapter_type VARCHAR NOT NULL,
+                spark_type VARCHAR NOT NULL,
+                is_nullable BOOLEAN DEFAULT TRUE,
+                is_primary_key BOOLEAN DEFAULT FALSE,
+                ordinal_position INTEGER,
+                last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                PRIMARY KEY(source_name, table_name, column_name)
+            )
+        """)
+
+        # Row counts table (ONLY populated by dvt snap, not during runs)
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS row_counts (
+                source_name VARCHAR NOT NULL,
+                table_name VARCHAR NOT NULL,
+                row_count BIGINT,
+                last_refreshed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                PRIMARY KEY(source_name, table_name)
+            )
+        """)
+
+        # =====================================================================
+        # v0.56.0: Profile Results (dvt profile command)
+        # =====================================================================
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS profile_results (
+                source_name VARCHAR NOT NULL,
+                table_name VARCHAR NOT NULL,
+                column_name VARCHAR NOT NULL,
+                profile_mode VARCHAR NOT NULL,
+
+                -- Basic metrics (all modes)
+                row_count BIGINT,
+                null_count BIGINT,
+                null_percent DOUBLE,
+                distinct_count BIGINT,
+                distinct_percent DOUBLE,
+
+                -- Numeric metrics (explorative+)
+                min_value DOUBLE,
+                max_value DOUBLE,
+                mean_value DOUBLE,
+                median_value DOUBLE,
+                stddev_value DOUBLE,
+                p25 DOUBLE,
+                p50 DOUBLE,
+                p75 DOUBLE,
+
+                -- String metrics (explorative+)
+                min_length INTEGER,
+                max_length INTEGER,
+                avg_length DOUBLE,
+
+                -- Distribution data (JSON)
+                histogram JSON,
+                top_values JSON,
+
+                -- Quality alerts
+                alerts JSON,
+
+                -- Metadata
+                profiled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                duration_ms INTEGER,
+
+                PRIMARY KEY(source_name, table_name, column_name, profile_mode)
+            )
+        """)
+
+        # =====================================================================
+        # v0.56.0: Catalog Nodes (dvt docs generate enhancement)
+        # =====================================================================
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS catalog_nodes (
+                unique_id VARCHAR PRIMARY KEY,
+                resource_type VARCHAR NOT NULL,
+                name VARCHAR NOT NULL,
+                schema_name VARCHAR,
+                database VARCHAR,
+
+                -- Connection info
+                connection_name VARCHAR,
+                adapter_type VARCHAR,
+
+                -- Documentation
+                description TEXT,
+
+                -- Visual enrichment
+                icon_type VARCHAR,
+                color_hex VARCHAR,
+
+                -- Config
+                materialized VARCHAR,
+                tags JSON,
+                meta JSON,
+
+                -- Columns (JSON array)
+                columns JSON,
+
+                -- Statistics
+                row_count BIGINT,
+                bytes_stored BIGINT,
+
+                -- Timestamps
+                created_at TIMESTAMP,
+                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """)
+
+        # =====================================================================
+        # v0.56.0: Lineage Edges (dvt docs generate enhancement)
+        # =====================================================================
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS lineage_edges (
+                id INTEGER PRIMARY KEY,
+                source_node_id VARCHAR NOT NULL,
+                target_node_id VARCHAR NOT NULL,
+                edge_type VARCHAR NOT NULL,
+
+                -- Cross-connection indicator
+                is_cross_connection BOOLEAN DEFAULT FALSE,
+                source_connection VARCHAR,
+                target_connection VARCHAR
+            )
+        """)
+
+        # Create indexes for fast lookups
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_column_metadata_source
+            ON column_metadata(source_name, table_name)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_column_metadata_adapter
+            ON column_metadata(adapter_name)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_row_counts_source
+            ON row_counts(source_name)
+        """)
+
+        # v0.56.0: New indexes for profile, catalog, and lineage
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_profile_results_table
+            ON profile_results(source_name, table_name)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_catalog_nodes_type
+            ON catalog_nodes(resource_type)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_lineage_edges_source
+            ON lineage_edges(source_node_id)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_lineage_edges_target
+            ON lineage_edges(target_node_id)
+        """)
+
+    # =========================================================================
+    # Type Registry Queries (delegated to shipped AdaptersRegistry)
+    # =========================================================================
+
+    def get_spark_type(
+        self,
+        adapter_name: str,
+        adapter_type: str,
+        spark_version: str = "all"
+    ) -> Optional[str]:
+        """
+        Look up the Spark type for an adapter type.
+
+        Delegates to the shipped AdaptersRegistry.
+
+        Args:
+            adapter_name: Name of the adapter (e.g., 'postgres', 'snowflake')
+            adapter_type: Native adapter type (e.g., 'VARCHAR', 'INTEGER')
+            spark_version: Target Spark version (default: 'all')
+
+        Returns:
+            Spark type string or None if not found
+        """
+        mapping = self.registry.get_spark_type(adapter_name, adapter_type, spark_version)
+        return mapping.spark_type if mapping else None
+
+    def get_type_mappings(
+        self,
+        adapter_name: str,
+        spark_version: str = "all"
+    ) -> List[Tuple[str, str]]:
+        """
+        Get all type mappings for an adapter.
+
+        Delegates to the shipped AdaptersRegistry.
+
+        Returns:
+            List of (adapter_type, spark_type) tuples
+        """
+        mappings = self.registry.get_all_mappings_for_adapter(adapter_name)
+        return [(m.adapter_type, m.spark_type) for m in mappings]
+
+    # =========================================================================
+    # Syntax Registry Queries (delegated to shipped AdaptersRegistry)
+    # =========================================================================
+
+    def get_syntax_rule(self, adapter_name: str) -> Optional[SyntaxRule]:
+        """
+        Get syntax rules for an adapter.
+
+        Delegates to the shipped AdaptersRegistry.
+
+        Args:
+            adapter_name: Name of the adapter
+
+        Returns:
+            SyntaxRule or None if not found
+        """
+        return self.registry.get_syntax_rule(adapter_name)
+
+    def quote_identifier(self, adapter_name: str, identifier: str) -> str:
+        """Quote an identifier for the given adapter."""
+        return self.registry.quote_identifier(adapter_name, identifier)
+
+    # =========================================================================
+    # Adapter Metadata Queries (delegated to shipped AdaptersRegistry)
+    # =========================================================================
+
+    def get_metadata_query(
+        self,
+        adapter_name: str,
+        query_type: str
+    ) -> Optional[str]:
+        """
+        Get the metadata query template for an adapter.
+
+        Delegates to the shipped AdaptersRegistry.
+
+        Args:
+            adapter_name: Name of the adapter
+            query_type: Type of query ('columns', 'tables', 'row_count', 'primary_key')
+
+        Returns:
+            Query template string or None if not found
+        """
+        query = self.registry.get_metadata_query(adapter_name, query_type)
+        return query.query_template if query else None
+
+    # =========================================================================
+    # Column Metadata Operations
+    # =========================================================================
+
+    def save_table_metadata(self, metadata: TableMetadata) -> None:
+        """
+        Save table column metadata to the store.
+
+        This is called during federated execution to capture schema info.
+
+        Args:
+            metadata: TableMetadata object with column info
+        """
+        # Delete existing entries for this table
+        self.conn.execute("""
+            DELETE FROM column_metadata
+            WHERE source_name = ? AND table_name = ?
+        """, [metadata.source_name, metadata.table_name])
+
+        # Insert new entries
+        for col in metadata.columns:
+            self.conn.execute("""
+                INSERT INTO column_metadata
+                (source_name, table_name, column_name, adapter_name, connection_name,
+                 schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                 ordinal_position, last_refreshed)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """, [
+                metadata.source_name,
+                metadata.table_name,
+                col.column_name,
+                metadata.adapter_name,
+                metadata.connection_name,
+                metadata.schema_name,
+                col.adapter_type,
+                col.spark_type,
+                col.is_nullable,
+                col.is_primary_key,
+                col.ordinal_position,
+                metadata.last_refreshed
+            ])
+
+    def get_table_metadata(
+        self,
+        source_name: str,
+        table_name: str
+    ) -> Optional[TableMetadata]:
+        """
+        Get cached column metadata for a table.
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+
+        Returns:
+            TableMetadata or None if not cached
+        """
+        results = self.conn.execute("""
+            SELECT
+                source_name, table_name, column_name, adapter_name, connection_name,
+                schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                ordinal_position, last_refreshed
+            FROM column_metadata
+            WHERE source_name = ? AND table_name = ?
+            ORDER BY ordinal_position
+        """, [source_name, table_name]).fetchall()
+
+        if not results:
+            return None
+
+        # Build column list
+        columns = []
+        for r in results:
+            columns.append(ColumnMetadata(
+                column_name=r[2],
+                adapter_type=r[6],
+                spark_type=r[7],
+                is_nullable=r[8],
+                is_primary_key=r[9],
+                ordinal_position=r[10]
+            ))
+
+        # Build TableMetadata from first row
+        first = results[0]
+        return TableMetadata(
+            source_name=first[0],
+            table_name=first[1],
+            adapter_name=first[3],
+            connection_name=first[4],
+            schema_name=first[5],
+            columns=columns,
+            last_refreshed=first[11]
+        )
+
+    def get_all_sources(self) -> List[Tuple[str, str]]:
+        """
+        Get all source/table combinations in the store.
+
+        Returns:
+            List of (source_name, table_name) tuples
+        """
+        results = self.conn.execute("""
+            SELECT DISTINCT source_name, table_name
+            FROM column_metadata
+            ORDER BY source_name, table_name
+        """).fetchall()
+
+        return [(r[0], r[1]) for r in results]
+
+    def clear_column_metadata(self) -> None:
+        """Clear all column metadata."""
+        self.conn.execute("DELETE FROM column_metadata")
+
+    # =========================================================================
+    # Row Count Operations (dvt snap only)
+    # =========================================================================
+
+    def save_row_count(
+        self,
+        source_name: str,
+        table_name: str,
+        row_count: int,
+        last_refreshed: Optional[datetime] = None
+    ) -> None:
+        """
+        Save row count for a table.
+
+        This is ONLY called by dvt snap, not during regular runs.
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+            row_count: Number of rows
+            last_refreshed: Timestamp (defaults to now)
+        """
+        if last_refreshed is None:
+            last_refreshed = datetime.now()
+
+        self.conn.execute("""
+            INSERT OR REPLACE INTO row_counts
+            (source_name, table_name, row_count, last_refreshed)
+            VALUES (?, ?, ?, ?)
+        """, [source_name, table_name, row_count, last_refreshed])
+
+    def get_row_count(self, source_name: str, table_name: str) -> Optional[RowCountInfo]:
+        """
+        Get cached row count for a table.
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+
+        Returns:
+            RowCountInfo or None if not cached
+        """
+        result = self.conn.execute("""
+            SELECT source_name, table_name, row_count, last_refreshed
+            FROM row_counts
+            WHERE source_name = ? AND table_name = ?
+        """, [source_name, table_name]).fetchone()
+
+        if result:
+            return RowCountInfo(
+                source_name=result[0],
+                table_name=result[1],
+                row_count=result[2],
+                last_refreshed=result[3]
+            )
+        return None
+
+    def get_all_row_counts(self) -> List[RowCountInfo]:
+        """
+        Get all cached row counts.
+
+        Returns:
+            List of RowCountInfo objects
+        """
+        results = self.conn.execute("""
+            SELECT source_name, table_name, row_count, last_refreshed
+            FROM row_counts
+            ORDER BY source_name, table_name
+        """).fetchall()
+
+        return [
+            RowCountInfo(
+                source_name=r[0],
+                table_name=r[1],
+                row_count=r[2],
+                last_refreshed=r[3]
+            )
+            for r in results
+        ]
+
+    def clear_row_counts(self) -> None:
+        """Clear all row count data."""
+        self.conn.execute("DELETE FROM row_counts")
+
+    def clear_snapshot(self) -> None:
+        """Clear all snapshot data (both column metadata and row counts)."""
+        self.clear_column_metadata()
+        self.clear_row_counts()
+
+    def clear_all_metadata(self) -> None:
+        """Clear ALL metadata from the store (columns, row counts, profiles)."""
+        self.clear_column_metadata()
+        self.clear_row_counts()
+        self.clear_profile_results()
+        # Note: catalog_nodes and lineage_edges are not cleared here
+        # as they're managed by dvt docs generate
+
+    def has_source_metadata(self) -> bool:
+        """
+        Check if there is any source metadata in the store.
+
+        Used to determine if this is the first run (auto-snapshot needed).
+
+        Returns:
+            True if source metadata exists, False otherwise
+        """
+        result = self.conn.execute("""
+            SELECT COUNT(*) FROM column_metadata
+            WHERE source_name NOT LIKE 'model:%'
+        """).fetchone()[0]
+        return result > 0
+
+    def has_any_metadata(self) -> bool:
+        """
+        Check if there is any metadata (sources or models) in the store.
+
+        Returns:
+            True if any metadata exists, False otherwise
+        """
+        result = self.conn.execute(
+            "SELECT COUNT(*) FROM column_metadata"
+        ).fetchone()[0]
+        return result > 0
+
+    # =========================================================================
+    # Legacy Compatibility - save_table_metadata with row_count
+    # =========================================================================
+
+    def save_table_metadata_with_row_count(
+        self,
+        source_name: str,
+        table_name: str,
+        adapter_name: str,
+        connection_name: str,
+        schema_name: str,
+        columns: List[ColumnMetadata],
+        row_count: Optional[int],
+        last_refreshed: datetime
+    ) -> None:
+        """
+        Save both column metadata and row count (used by dvt snap).
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+            adapter_name: Name of the adapter
+            connection_name: Name of the connection
+            schema_name: Schema name
+            columns: List of ColumnMetadata
+            row_count: Number of rows (or None)
+            last_refreshed: Timestamp
+        """
+        # Save column metadata
+        metadata = TableMetadata(
+            source_name=source_name,
+            table_name=table_name,
+            adapter_name=adapter_name,
+            connection_name=connection_name,
+            schema_name=schema_name,
+            columns=columns,
+            last_refreshed=last_refreshed
+        )
+        self.save_table_metadata(metadata)
+
+        # Save row count separately (only if provided)
+        if row_count is not None:
+            self.save_row_count(source_name, table_name, row_count, last_refreshed)
+
+    # =========================================================================
+    # Utility Methods
+    # =========================================================================
+
+    def exists(self) -> bool:
+        """Check if the metadata store exists."""
+        return self.db_path.exists()
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the metadata store."""
+        # Count column metadata
+        tables_count = self.conn.execute(
+            "SELECT COUNT(DISTINCT source_name || '.' || table_name) FROM column_metadata"
+        ).fetchone()[0]
+
+        columns_count = self.conn.execute(
+            "SELECT COUNT(*) FROM column_metadata"
+        ).fetchone()[0]
+
+        # Count row counts
+        row_counts_count = self.conn.execute(
+            "SELECT COUNT(*) FROM row_counts"
+        ).fetchone()[0]
+
+        # Get registry stats
+        registry = self.registry
+        adapters = registry.get_supported_adapters()
+
+        return {
+            "metadata_tables": tables_count,
+            "metadata_columns": columns_count,
+            "row_counts_cached": row_counts_count,
+            "registry_adapters": len(adapters),
+            "supported_adapters": adapters,
+            "db_path": str(self.db_path),
+        }
+
+    def migrate_from_legacy(self) -> bool:
+        """
+        Migrate from legacy metadata.duckdb format to new format.
+
+        Returns:
+            True if migration was performed, False if not needed
+        """
+        legacy_path = self.dvt_dir / "metadata.duckdb"
+        if not legacy_path.exists():
+            return False
+
+        # Check if new store already exists
+        if self.db_path.exists():
+            return False
+
+        try:
+            # Connect to legacy database
+            legacy_conn = duckdb.connect(str(legacy_path), read_only=True)
+
+            # Check if metadata_snapshot table exists
+            result = legacy_conn.execute("""
+                SELECT COUNT(*) FROM information_schema.tables
+                WHERE table_name = 'metadata_snapshot'
+            """).fetchone()[0]
+
+            if result == 0:
+                legacy_conn.close()
+                return False
+
+            # Initialize new store
+            self.initialize()
+
+            # Migrate metadata_snapshot to column_metadata
+            rows = legacy_conn.execute("""
+                SELECT DISTINCT
+                    source_name, table_name, column_name, adapter_name, connection_name,
+                    schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                    ordinal_position, last_refreshed
+                FROM metadata_snapshot
+            """).fetchall()
+
+            for row in rows:
+                self.conn.execute("""
+                    INSERT OR REPLACE INTO column_metadata
+                    (source_name, table_name, column_name, adapter_name, connection_name,
+                     schema_name, adapter_type, spark_type, is_nullable, is_primary_key,
+                     ordinal_position, last_refreshed)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """, list(row))
+
+            # Migrate row_count data (distinct per table)
+            row_count_rows = legacy_conn.execute("""
+                SELECT DISTINCT source_name, table_name, row_count, MAX(last_refreshed)
+                FROM metadata_snapshot
+                WHERE row_count IS NOT NULL
+                GROUP BY source_name, table_name, row_count
+            """).fetchall()
+
+            for row in row_count_rows:
+                if row[2] is not None:  # row_count
+                    self.conn.execute("""
+                        INSERT OR REPLACE INTO row_counts
+                        (source_name, table_name, row_count, last_refreshed)
+                        VALUES (?, ?, ?, ?)
+                    """, list(row))
+
+            legacy_conn.close()
+            return True
+
+        except Exception as e:
+            print(f"[DVT] Warning: Migration failed: {e}")
+            return False
+
+    # =========================================================================
+    # Profile Results Operations (v0.56.0 - dvt profile command)
+    # =========================================================================
+
+    def save_profile_result(self, result: ColumnProfileResult) -> None:
+        """
+        Save a column profile result to the store.
+
+        Args:
+            result: ColumnProfileResult object
+        """
+        self.conn.execute("""
+            INSERT OR REPLACE INTO profile_results
+            (source_name, table_name, column_name, profile_mode,
+             row_count, null_count, null_percent, distinct_count, distinct_percent,
+             min_value, max_value, mean_value, median_value, stddev_value,
+             p25, p50, p75, min_length, max_length, avg_length,
+             histogram, top_values, alerts, profiled_at, duration_ms)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """, [
+            result.source_name, result.table_name, result.column_name, result.profile_mode,
+            result.row_count, result.null_count, result.null_percent,
+            result.distinct_count, result.distinct_percent,
+            result.min_value, result.max_value, result.mean_value,
+            result.median_value, result.stddev_value,
+            result.p25, result.p50, result.p75,
+            result.min_length, result.max_length, result.avg_length,
+            result.histogram, result.top_values, result.alerts,
+            result.profiled_at or datetime.now(), result.duration_ms
+        ])
+
+    def save_profile_results_batch(self, results: List[ColumnProfileResult]) -> None:
+        """
+        Save multiple profile results in a batch.
+
+        Args:
+            results: List of ColumnProfileResult objects
+        """
+        for result in results:
+            self.save_profile_result(result)
+
+    def get_profile_result(
+        self,
+        source_name: str,
+        table_name: str,
+        column_name: str,
+        profile_mode: str
+    ) -> Optional[ColumnProfileResult]:
+        """
+        Get a profile result for a specific column.
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+            column_name: Name of the column
+            profile_mode: Profile mode ('minimal', 'explorative', etc.)
+
+        Returns:
+            ColumnProfileResult or None if not found
+        """
+        result = self.conn.execute("""
+            SELECT source_name, table_name, column_name, profile_mode,
+                   row_count, null_count, null_percent, distinct_count, distinct_percent,
+                   min_value, max_value, mean_value, median_value, stddev_value,
+                   p25, p50, p75, min_length, max_length, avg_length,
+                   histogram, top_values, alerts, profiled_at, duration_ms
+            FROM profile_results
+            WHERE source_name = ? AND table_name = ? AND column_name = ? AND profile_mode = ?
+        """, [source_name, table_name, column_name, profile_mode]).fetchone()
+
+        if result:
+            return ColumnProfileResult(
+                source_name=result[0], table_name=result[1],
+                column_name=result[2], profile_mode=result[3],
+                row_count=result[4], null_count=result[5], null_percent=result[6],
+                distinct_count=result[7], distinct_percent=result[8],
+                min_value=result[9], max_value=result[10], mean_value=result[11],
+                median_value=result[12], stddev_value=result[13],
+                p25=result[14], p50=result[15], p75=result[16],
+                min_length=result[17], max_length=result[18], avg_length=result[19],
+                histogram=result[20], top_values=result[21], alerts=result[22],
+                profiled_at=result[23], duration_ms=result[24]
+            )
+        return None
+
+    def get_table_profile(
+        self,
+        source_name: str,
+        table_name: str,
+        profile_mode: Optional[str] = None
+    ) -> List[ColumnProfileResult]:
+        """
+        Get all profile results for a table.
+
+        Args:
+            source_name: Name of the source
+            table_name: Name of the table
+            profile_mode: Optional mode filter
+
+        Returns:
+            List of ColumnProfileResult objects
+        """
+        if profile_mode:
+            results = self.conn.execute("""
+                SELECT source_name, table_name, column_name, profile_mode,
+                       row_count, null_count, null_percent, distinct_count, distinct_percent,
+                       min_value, max_value, mean_value, median_value, stddev_value,
+                       p25, p50, p75, min_length, max_length, avg_length,
+                       histogram, top_values, alerts, profiled_at, duration_ms
+                FROM profile_results
+                WHERE source_name = ? AND table_name = ? AND profile_mode = ?
+                ORDER BY column_name
+            """, [source_name, table_name, profile_mode]).fetchall()
+        else:
+            results = self.conn.execute("""
+                SELECT source_name, table_name, column_name, profile_mode,
+                       row_count, null_count, null_percent, distinct_count, distinct_percent,
+                       min_value, max_value, mean_value, median_value, stddev_value,
+                       p25, p50, p75, min_length, max_length, avg_length,
+                       histogram, top_values, alerts, profiled_at, duration_ms
+                FROM profile_results
+                WHERE source_name = ? AND table_name = ?
+                ORDER BY column_name
+            """, [source_name, table_name]).fetchall()
+
+        return [
+            ColumnProfileResult(
+                source_name=r[0], table_name=r[1], column_name=r[2], profile_mode=r[3],
+                row_count=r[4], null_count=r[5], null_percent=r[6],
+                distinct_count=r[7], distinct_percent=r[8],
+                min_value=r[9], max_value=r[10], mean_value=r[11],
+                median_value=r[12], stddev_value=r[13],
+                p25=r[14], p50=r[15], p75=r[16],
+                min_length=r[17], max_length=r[18], avg_length=r[19],
+                histogram=r[20], top_values=r[21], alerts=r[22],
+                profiled_at=r[23], duration_ms=r[24]
+            )
+            for r in results
+        ]
+
+    def get_all_profiled_tables(self) -> List[Tuple[str, str, str, datetime]]:
+        """
+        Get all profiled tables with their latest profile timestamp.
+
+        Returns:
+            List of (source_name, table_name, profile_mode, profiled_at) tuples
+        """
+        results = self.conn.execute("""
+            SELECT source_name, table_name, profile_mode, MAX(profiled_at) as last_profiled
+            FROM profile_results
+            GROUP BY source_name, table_name, profile_mode
+            ORDER BY source_name, table_name
+        """).fetchall()
+
+        return [(r[0], r[1], r[2], r[3]) for r in results]
+
+    def get_profile_alerts(self, source_name: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Get all profile alerts, optionally filtered by source.
+
+        Args:
+            source_name: Optional source filter
+
+        Returns:
+            List of alert dicts with source/table/column info
+        """
+        import json
+
+        if source_name:
+            results = self.conn.execute("""
+                SELECT source_name, table_name, column_name, alerts
+                FROM profile_results
+                WHERE source_name = ? AND alerts IS NOT NULL
+            """, [source_name]).fetchall()
+        else:
+            results = self.conn.execute("""
+                SELECT source_name, table_name, column_name, alerts
+                FROM profile_results
+                WHERE alerts IS NOT NULL
+            """).fetchall()
+
+        all_alerts = []
+        for r in results:
+            try:
+                alerts = json.loads(r[3]) if r[3] else []
+                for alert in alerts:
+                    alert["source_name"] = r[0]
+                    alert["table_name"] = r[1]
+                    alert["column_name"] = r[2]
+                    all_alerts.append(alert)
+            except json.JSONDecodeError:
+                pass
+
+        return all_alerts
+
+    def clear_profile_results(self, source_name: Optional[str] = None) -> None:
+        """
+        Clear profile results, optionally for a specific source.
+
+        Args:
+            source_name: Optional source filter
+        """
+        if source_name:
+            self.conn.execute("DELETE FROM profile_results WHERE source_name = ?", [source_name])
+        else:
+            self.conn.execute("DELETE FROM profile_results")
+
+    # =========================================================================
+    # Catalog Node Operations (v0.56.0 - dvt docs generate enhancement)
+    # =========================================================================
+
+    def save_catalog_node(self, node: CatalogNode) -> None:
+        """
+        Save a catalog node to the store.
+
+        Args:
+            node: CatalogNode object
+        """
+        self.conn.execute("""
+            INSERT OR REPLACE INTO catalog_nodes
+            (unique_id, resource_type, name, schema_name, database,
+             connection_name, adapter_type, description, icon_type, color_hex,
+             materialized, tags, meta, columns, row_count, bytes_stored,
+             created_at, updated_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """, [
+            node.unique_id, node.resource_type, node.name,
+            node.schema_name, node.database,
+            node.connection_name, node.adapter_type,
+            node.description, node.icon_type, node.color_hex,
+            node.materialized, node.tags, node.meta, node.columns,
+            node.row_count, node.bytes_stored,
+            node.created_at, node.updated_at or datetime.now()
+        ])
+
+    def save_catalog_nodes_batch(self, nodes: List[CatalogNode]) -> None:
+        """
+        Save multiple catalog nodes in a batch.
+
+        Args:
+            nodes: List of CatalogNode objects
+        """
+        for node in nodes:
+            self.save_catalog_node(node)
+
+    def get_catalog_node(self, unique_id: str) -> Optional[CatalogNode]:
+        """
+        Get a catalog node by unique ID.
+
+        Args:
+            unique_id: Unique node ID
+
+        Returns:
+            CatalogNode or None if not found
+        """
+        result = self.conn.execute("""
+            SELECT unique_id, resource_type, name, schema_name, database,
+                   connection_name, adapter_type, description, icon_type, color_hex,
+                   materialized, tags, meta, columns, row_count, bytes_stored,
+                   created_at, updated_at
+            FROM catalog_nodes
+            WHERE unique_id = ?
+        """, [unique_id]).fetchone()
+
+        if result:
+            return CatalogNode(
+                unique_id=result[0], resource_type=result[1], name=result[2],
+                schema_name=result[3], database=result[4],
+                connection_name=result[5], adapter_type=result[6],
+                description=result[7], icon_type=result[8], color_hex=result[9],
+                materialized=result[10], tags=result[11], meta=result[12],
+                columns=result[13], row_count=result[14], bytes_stored=result[15],
+                created_at=result[16], updated_at=result[17]
+            )
+        return None
+
+    def get_catalog_nodes_by_type(self, resource_type: str) -> List[CatalogNode]:
+        """
+        Get all catalog nodes of a specific type.
+
+        Args:
+            resource_type: Type filter ('model', 'source', etc.)
+
+        Returns:
+            List of CatalogNode objects
+        """
+        results = self.conn.execute("""
+            SELECT unique_id, resource_type, name, schema_name, database,
+                   connection_name, adapter_type, description, icon_type, color_hex,
+                   materialized, tags, meta, columns, row_count, bytes_stored,
|
|
1200
|
+
created_at, updated_at
|
|
1201
|
+
FROM catalog_nodes
|
|
1202
|
+
WHERE resource_type = ?
|
|
1203
|
+
ORDER BY name
|
|
1204
|
+
""", [resource_type]).fetchall()
|
|
1205
|
+
|
|
1206
|
+
return [
|
|
1207
|
+
CatalogNode(
|
|
1208
|
+
unique_id=r[0], resource_type=r[1], name=r[2],
|
|
1209
|
+
schema_name=r[3], database=r[4],
|
|
1210
|
+
connection_name=r[5], adapter_type=r[6],
|
|
1211
|
+
description=r[7], icon_type=r[8], color_hex=r[9],
|
|
1212
|
+
materialized=r[10], tags=r[11], meta=r[12],
|
|
1213
|
+
columns=r[13], row_count=r[14], bytes_stored=r[15],
|
|
1214
|
+
created_at=r[16], updated_at=r[17]
|
|
1215
|
+
)
|
|
1216
|
+
for r in results
|
|
1217
|
+
]
|
|
1218
|
+
|
|
1219
|
+
def get_all_catalog_nodes(self) -> List[CatalogNode]:
|
|
1220
|
+
"""
|
|
1221
|
+
Get all catalog nodes.
|
|
1222
|
+
|
|
1223
|
+
Returns:
|
|
1224
|
+
List of CatalogNode objects
|
|
1225
|
+
"""
|
|
1226
|
+
results = self.conn.execute("""
|
|
1227
|
+
SELECT unique_id, resource_type, name, schema_name, database,
|
|
1228
|
+
connection_name, adapter_type, description, icon_type, color_hex,
|
|
1229
|
+
materialized, tags, meta, columns, row_count, bytes_stored,
|
|
1230
|
+
created_at, updated_at
|
|
1231
|
+
FROM catalog_nodes
|
|
1232
|
+
ORDER BY resource_type, name
|
|
1233
|
+
""").fetchall()
|
|
1234
|
+
|
|
1235
|
+
return [
|
|
1236
|
+
CatalogNode(
|
|
1237
|
+
unique_id=r[0], resource_type=r[1], name=r[2],
|
|
1238
|
+
schema_name=r[3], database=r[4],
|
|
1239
|
+
connection_name=r[5], adapter_type=r[6],
|
|
1240
|
+
description=r[7], icon_type=r[8], color_hex=r[9],
|
|
1241
|
+
materialized=r[10], tags=r[11], meta=r[12],
|
|
1242
|
+
columns=r[13], row_count=r[14], bytes_stored=r[15],
|
|
1243
|
+
created_at=r[16], updated_at=r[17]
|
|
1244
|
+
)
|
|
1245
|
+
for r in results
|
|
1246
|
+
]
|
|
1247
|
+
|
|
1248
|
+
def search_catalog_nodes(self, query: str) -> List[CatalogNode]:
|
|
1249
|
+
"""
|
|
1250
|
+
Search catalog nodes by name or description.
|
|
1251
|
+
|
|
1252
|
+
Args:
|
|
1253
|
+
query: Search query string
|
|
1254
|
+
|
|
1255
|
+
Returns:
|
|
1256
|
+
List of matching CatalogNode objects
|
|
1257
|
+
"""
|
|
1258
|
+
search_pattern = f"%{query}%"
|
|
1259
|
+
results = self.conn.execute("""
|
|
1260
|
+
SELECT unique_id, resource_type, name, schema_name, database,
|
|
1261
|
+
connection_name, adapter_type, description, icon_type, color_hex,
|
|
1262
|
+
materialized, tags, meta, columns, row_count, bytes_stored,
|
|
1263
|
+
created_at, updated_at
|
|
1264
|
+
FROM catalog_nodes
|
|
1265
|
+
WHERE name ILIKE ? OR description ILIKE ? OR unique_id ILIKE ?
|
|
1266
|
+
ORDER BY resource_type, name
|
|
1267
|
+
""", [search_pattern, search_pattern, search_pattern]).fetchall()
|
|
1268
|
+
|
|
1269
|
+
return [
|
|
1270
|
+
CatalogNode(
|
|
1271
|
+
unique_id=r[0], resource_type=r[1], name=r[2],
|
|
1272
|
+
schema_name=r[3], database=r[4],
|
|
1273
|
+
connection_name=r[5], adapter_type=r[6],
|
|
1274
|
+
description=r[7], icon_type=r[8], color_hex=r[9],
|
|
1275
|
+
materialized=r[10], tags=r[11], meta=r[12],
|
|
1276
|
+
columns=r[13], row_count=r[14], bytes_stored=r[15],
|
|
1277
|
+
created_at=r[16], updated_at=r[17]
|
|
1278
|
+
)
|
|
1279
|
+
for r in results
|
|
1280
|
+
]
|
|
1281
|
+
|
|
1282
|
+
def clear_catalog_nodes(self) -> None:
|
|
1283
|
+
"""Clear all catalog nodes."""
|
|
1284
|
+
self.conn.execute("DELETE FROM catalog_nodes")
|
|
1285
|
+
|
|
1286
|
+
# =========================================================================
|
|
1287
|
+
# Lineage Edge Operations (v0.56.0 - dvt docs generate enhancement)
|
|
1288
|
+
# =========================================================================
|
|
1289
|
+
|
|
1290
|
+
def save_lineage_edge(self, edge: LineageEdge) -> int:
|
|
1291
|
+
"""
|
|
1292
|
+
Save a lineage edge to the store.
|
|
1293
|
+
|
|
1294
|
+
Args:
|
|
1295
|
+
edge: LineageEdge object
|
|
1296
|
+
|
|
1297
|
+
Returns:
|
|
1298
|
+
ID of the inserted edge
|
|
1299
|
+
"""
|
|
1300
|
+
if edge.id:
|
|
1301
|
+
self.conn.execute("""
|
|
1302
|
+
INSERT OR REPLACE INTO lineage_edges
|
|
1303
|
+
(id, source_node_id, target_node_id, edge_type,
|
|
1304
|
+
is_cross_connection, source_connection, target_connection)
|
|
1305
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
1306
|
+
""", [
|
|
1307
|
+
edge.id, edge.source_node_id, edge.target_node_id, edge.edge_type,
|
|
1308
|
+
edge.is_cross_connection, edge.source_connection, edge.target_connection
|
|
1309
|
+
])
|
|
1310
|
+
return edge.id
|
|
1311
|
+
else:
|
|
1312
|
+
result = self.conn.execute("""
|
|
1313
|
+
INSERT INTO lineage_edges
|
|
1314
|
+
(source_node_id, target_node_id, edge_type,
|
|
1315
|
+
is_cross_connection, source_connection, target_connection)
|
|
1316
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
1317
|
+
RETURNING id
|
|
1318
|
+
""", [
|
|
1319
|
+
edge.source_node_id, edge.target_node_id, edge.edge_type,
|
|
1320
|
+
edge.is_cross_connection, edge.source_connection, edge.target_connection
|
|
1321
|
+
]).fetchone()
|
|
1322
|
+
return result[0]
|
|
1323
|
+
|
|
1324
|
+
def save_lineage_edges_batch(self, edges: List[LineageEdge]) -> None:
|
|
1325
|
+
"""
|
|
1326
|
+
Save multiple lineage edges in a batch.
|
|
1327
|
+
|
|
1328
|
+
Args:
|
|
1329
|
+
edges: List of LineageEdge objects
|
|
1330
|
+
"""
|
|
1331
|
+
for edge in edges:
|
|
1332
|
+
self.save_lineage_edge(edge)
|
|
1333
|
+
|
|
1334
|
+
def get_lineage_edge(self, edge_id: int) -> Optional[LineageEdge]:
|
|
1335
|
+
"""
|
|
1336
|
+
Get a lineage edge by ID.
|
|
1337
|
+
|
|
1338
|
+
Args:
|
|
1339
|
+
edge_id: Edge ID
|
|
1340
|
+
|
|
1341
|
+
Returns:
|
|
1342
|
+
LineageEdge or None if not found
|
|
1343
|
+
"""
|
|
1344
|
+
result = self.conn.execute("""
|
|
1345
|
+
SELECT id, source_node_id, target_node_id, edge_type,
|
|
1346
|
+
is_cross_connection, source_connection, target_connection
|
|
1347
|
+
FROM lineage_edges
|
|
1348
|
+
WHERE id = ?
|
|
1349
|
+
""", [edge_id]).fetchone()
|
|
1350
|
+
|
|
1351
|
+
if result:
|
|
1352
|
+
return LineageEdge(
|
|
1353
|
+
id=result[0], source_node_id=result[1], target_node_id=result[2],
|
|
1354
|
+
edge_type=result[3], is_cross_connection=result[4],
|
|
1355
|
+
source_connection=result[5], target_connection=result[6]
|
|
1356
|
+
)
|
|
1357
|
+
return None
|
|
1358
|
+
|
|
1359
|
+
def get_upstream_edges(self, node_id: str) -> List[LineageEdge]:
|
|
1360
|
+
"""
|
|
1361
|
+
Get all edges where this node is the target (upstream dependencies).
|
|
1362
|
+
|
|
1363
|
+
Args:
|
|
1364
|
+
node_id: Node unique ID
|
|
1365
|
+
|
|
1366
|
+
Returns:
|
|
1367
|
+
List of LineageEdge objects
|
|
1368
|
+
"""
|
|
1369
|
+
results = self.conn.execute("""
|
|
1370
|
+
SELECT id, source_node_id, target_node_id, edge_type,
|
|
1371
|
+
is_cross_connection, source_connection, target_connection
|
|
1372
|
+
FROM lineage_edges
|
|
1373
|
+
WHERE target_node_id = ?
|
|
1374
|
+
""", [node_id]).fetchall()
|
|
1375
|
+
|
|
1376
|
+
return [
|
|
1377
|
+
LineageEdge(
|
|
1378
|
+
id=r[0], source_node_id=r[1], target_node_id=r[2],
|
|
1379
|
+
edge_type=r[3], is_cross_connection=r[4],
|
|
1380
|
+
source_connection=r[5], target_connection=r[6]
|
|
1381
|
+
)
|
|
1382
|
+
for r in results
|
|
1383
|
+
]
|
|
1384
|
+
|
|
1385
|
+
def get_downstream_edges(self, node_id: str) -> List[LineageEdge]:
|
|
1386
|
+
"""
|
|
1387
|
+
Get all edges where this node is the source (downstream dependents).
|
|
1388
|
+
|
|
1389
|
+
Args:
|
|
1390
|
+
node_id: Node unique ID
|
|
1391
|
+
|
|
1392
|
+
Returns:
|
|
1393
|
+
List of LineageEdge objects
|
|
1394
|
+
"""
|
|
1395
|
+
results = self.conn.execute("""
|
|
1396
|
+
SELECT id, source_node_id, target_node_id, edge_type,
|
|
1397
|
+
is_cross_connection, source_connection, target_connection
|
|
1398
|
+
FROM lineage_edges
|
|
1399
|
+
WHERE source_node_id = ?
|
|
1400
|
+
""", [node_id]).fetchall()
|
|
1401
|
+
|
|
1402
|
+
return [
|
|
1403
|
+
LineageEdge(
|
|
1404
|
+
id=r[0], source_node_id=r[1], target_node_id=r[2],
|
|
1405
|
+
edge_type=r[3], is_cross_connection=r[4],
|
|
1406
|
+
source_connection=r[5], target_connection=r[6]
|
|
1407
|
+
)
|
|
1408
|
+
for r in results
|
|
1409
|
+
]
|
|
1410
|
+
|
|
1411
|
+
def get_all_lineage_edges(self) -> List[LineageEdge]:
|
|
1412
|
+
"""
|
|
1413
|
+
Get all lineage edges.
|
|
1414
|
+
|
|
1415
|
+
Returns:
|
|
1416
|
+
List of LineageEdge objects
|
|
1417
|
+
"""
|
|
1418
|
+
results = self.conn.execute("""
|
|
1419
|
+
SELECT id, source_node_id, target_node_id, edge_type,
|
|
1420
|
+
is_cross_connection, source_connection, target_connection
|
|
1421
|
+
FROM lineage_edges
|
|
1422
|
+
ORDER BY source_node_id, target_node_id
|
|
1423
|
+
""").fetchall()
|
|
1424
|
+
|
|
1425
|
+
return [
|
|
1426
|
+
LineageEdge(
|
|
1427
|
+
id=r[0], source_node_id=r[1], target_node_id=r[2],
|
|
1428
|
+
edge_type=r[3], is_cross_connection=r[4],
|
|
1429
|
+
source_connection=r[5], target_connection=r[6]
|
|
1430
|
+
)
|
|
1431
|
+
for r in results
|
|
1432
|
+
]
|
|
1433
|
+
|
|
1434
|
+
def get_cross_connection_edges(self) -> List[LineageEdge]:
|
|
1435
|
+
"""
|
|
1436
|
+
Get all edges that cross connection boundaries.
|
|
1437
|
+
|
|
1438
|
+
Returns:
|
|
1439
|
+
List of cross-connection LineageEdge objects
|
|
1440
|
+
"""
|
|
1441
|
+
results = self.conn.execute("""
|
|
1442
|
+
SELECT id, source_node_id, target_node_id, edge_type,
|
|
1443
|
+
is_cross_connection, source_connection, target_connection
|
|
1444
|
+
FROM lineage_edges
|
|
1445
|
+
WHERE is_cross_connection = TRUE
|
|
1446
|
+
ORDER BY source_node_id, target_node_id
|
|
1447
|
+
""").fetchall()
|
|
1448
|
+
|
|
1449
|
+
return [
|
|
1450
|
+
LineageEdge(
|
|
1451
|
+
id=r[0], source_node_id=r[1], target_node_id=r[2],
|
|
1452
|
+
edge_type=r[3], is_cross_connection=r[4],
|
|
1453
|
+
source_connection=r[5], target_connection=r[6]
|
|
1454
|
+
)
|
|
1455
|
+
for r in results
|
|
1456
|
+
]
|
|
1457
|
+
|
|
1458
|
+
def get_lineage_graph(self) -> Dict[str, Any]:
|
|
1459
|
+
"""
|
|
1460
|
+
Get the full lineage graph as a dict suitable for visualization.
|
|
1461
|
+
|
|
1462
|
+
Returns:
|
|
1463
|
+
Dict with 'nodes' and 'edges' keys
|
|
1464
|
+
"""
|
|
1465
|
+
nodes = self.get_all_catalog_nodes()
|
|
1466
|
+
edges = self.get_all_lineage_edges()
|
|
1467
|
+
|
|
1468
|
+
return {
|
|
1469
|
+
"nodes": [
|
|
1470
|
+
{
|
|
1471
|
+
"id": n.unique_id,
|
|
1472
|
+
"type": n.resource_type,
|
|
1473
|
+
"name": n.name,
|
|
1474
|
+
"connection": n.connection_name,
|
|
1475
|
+
"adapter": n.adapter_type,
|
|
1476
|
+
"icon": n.icon_type,
|
|
1477
|
+
"color": n.color_hex,
|
|
1478
|
+
}
|
|
1479
|
+
for n in nodes
|
|
1480
|
+
],
|
|
1481
|
+
"edges": [
|
|
1482
|
+
{
|
|
1483
|
+
"source": e.source_node_id,
|
|
1484
|
+
"target": e.target_node_id,
|
|
1485
|
+
"type": e.edge_type,
|
|
1486
|
+
"cross_connection": e.is_cross_connection,
|
|
1487
|
+
}
|
|
1488
|
+
for e in edges
|
|
1489
|
+
],
|
|
1490
|
+
}
|
|
1491
|
+
|
|
1492
|
+
def clear_lineage_edges(self) -> None:
|
|
1493
|
+
"""Clear all lineage edges."""
|
|
1494
|
+
self.conn.execute("DELETE FROM lineage_edges")
|
|
1495
|
+
|
|
1496
|
+
def clear_catalog_and_lineage(self) -> None:
|
|
1497
|
+
"""Clear both catalog nodes and lineage edges."""
|
|
1498
|
+
self.clear_lineage_edges()
|
|
1499
|
+
self.clear_catalog_nodes()
|