dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/task/profile.py
ADDED
@@ -0,0 +1,1307 @@
"""
DVT Profile Task

Data profiling task with DAG-based execution for sources and models.
Works like 'dvt run' with full selector support and DVT compute rules.

v0.56.0: Initial implementation with 4 profiling modes.
v0.58.1: PipeRider-style profiling - fast SQL-based metrics instead of ydata-profiling.

Modes:
- explorative: Full profiling (distributions, patterns, correlations) [DEFAULT]
- minimal: Basic stats (null%, distinct%, min/max)
- sensitive: Redacted profiling (masks PII-like columns)
- time-series: Temporal analysis (ACF, PACF, seasonality)

PipeRider-Style Metrics (v0.58.1):
- row_count, column_count
- null_count, null_percent
- distinct_count, distinct_percent
- min, max, mean, median, stddev
- top_values (most frequent)
- data_type distribution
"""

import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import click

# Try to import Rich for beautiful CLI output
try:
    from rich.console import Console
    from rich.progress import (
        Progress,
        TextColumn,
        BarColumn,
        MofNCompleteColumn,
        TimeElapsedColumn,
        SpinnerColumn,
        TaskProgressColumn,
    )
    from rich.table import Table
    from rich import box
    from rich.panel import Panel
    from rich.style import Style
    from rich.live import Live
    HAS_RICH = True
except ImportError:
    HAS_RICH = False

from dbt.artifacts.schemas.run import RunStatus
from dbt.config.runtime import RuntimeConfig
from dbt.contracts.graph.manifest import Manifest
from dbt.contracts.graph.nodes import SourceDefinition, ModelNode
from dbt.task.base import BaseTask

# Initialize Rich console
console = Console() if HAS_RICH else None

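# (Editor's note) Example invocations, based on the module docstring above and the
# --select hint printed by ProfileTask.run() below. "dvt profile run --select ..." is
# shown verbatim in this file; the flag names for mode and thread count are assumptions:
#
#   dvt profile run --select 'source:*'                      # all sources, explorative mode
#   dvt profile run --select 'model:staging.*' --mode minimal --threads 8
#
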
@dataclass
class ColumnProfile:
    """
    Profile result for a single column (PipeRider-style metrics).

    PipeRider Metric Names (exact copy from piperider_cli/profiler/profiler.py):
    - total: Total row count in table
    - samples: Number of sampled rows (same as total if no sampling)
    - samples_p: Sampling percentage (1.0 = 100%)
    - non_nulls: Count of non-null values
    - non_nulls_p: Percentage of non-null values
    - nulls: Count of null values
    - nulls_p: Percentage of null values
    - valids: Count of valid values (non-null, parseable)
    - valids_p: Percentage of valid values
    - invalids: Count of invalid values
    - invalids_p: Percentage of invalid values
    - distinct: Count of distinct values
    - distinct_p: Percentage of distinct values
    - duplicates: Count of duplicate values
    - duplicates_p: Percentage of duplicate values
    - non_duplicates: Count of non-duplicate (unique) values
    - non_duplicates_p: Percentage of non-duplicate values
    - min: Minimum value
    - max: Maximum value
    - sum: Sum (numeric only)
    - avg: Average/mean (numeric only)
    - stddev: Standard deviation (numeric only)
    - p5, p25, p50, p75, p95: Percentiles (numeric only)
    - zeros, zeros_p: Zero values (numeric only)
    - negatives, negatives_p: Negative values (numeric only)
    - positives, positives_p: Positive values (numeric only)
    - min_length, max_length, avg_length: String length stats
    - zero_length, zero_length_p: Empty strings
    - topk: Top K values with counts
    - histogram: Distribution histogram
    """
    # Column identity
    name: str  # PipeRider uses 'name' not 'column_name'
    type: str  # PipeRider uses 'type' (generic: string, integer, numeric, datetime, boolean, other)
    schema_type: str = ""  # Original database type (VARCHAR, INTEGER, etc.)

    # Core metrics (PipeRider exact names)
    total: Optional[int] = None  # Set from table row_count
    samples: int = 0  # Number of sampled rows
    samples_p: Optional[float] = None  # Sampling percentage

    # Null metrics
    non_nulls: int = 0
    non_nulls_p: Optional[float] = None
    nulls: int = 0
    nulls_p: Optional[float] = None

    # Validity metrics
    valids: int = 0
    valids_p: Optional[float] = None
    invalids: int = 0
    invalids_p: Optional[float] = None

    # Distinct/uniqueness metrics
    distinct: int = 0
    distinct_p: Optional[float] = None
    duplicates: int = 0
    duplicates_p: Optional[float] = None
    non_duplicates: int = 0
    non_duplicates_p: Optional[float] = None

    # Numeric statistics
    min: Optional[float] = None
    max: Optional[float] = None
    sum: Optional[float] = None
    avg: Optional[float] = None
    stddev: Optional[float] = None

    # Percentiles (numeric)
    p5: Optional[float] = None
    p25: Optional[float] = None
    p50: Optional[float] = None
    p75: Optional[float] = None
    p95: Optional[float] = None

    # Numeric sign distribution
    zeros: int = 0
    zeros_p: Optional[float] = None
    negatives: int = 0
    negatives_p: Optional[float] = None
    positives: int = 0
    positives_p: Optional[float] = None

    # String length metrics
    min_length: Optional[int] = None
    max_length: Optional[int] = None
    avg_length: Optional[float] = None
    stddev_length: Optional[float] = None
    zero_length: int = 0
    zero_length_p: Optional[float] = None
    non_zero_length: int = 0
    non_zero_length_p: Optional[float] = None

    # Boolean metrics
    trues: int = 0
    trues_p: Optional[float] = None
    falses: int = 0
    falses_p: Optional[float] = None

    # Distribution data (PipeRider format)
    topk: Optional[Dict] = None  # {"values": [...], "counts": [...]}
    histogram: Optional[Dict] = None  # {"labels": [...], "counts": [...], "bin_edges": [...]}
    histogram_length: Optional[Dict] = None  # For string length distribution

    # Quality alerts (PipeRider format)
    alerts: List[Dict] = field(default_factory=list)

    # Profiling metadata
    profile_duration: Optional[str] = None  # "1.23" seconds
    elapsed_milli: int = 0  # Duration in milliseconds

    # Legacy aliases for backward compatibility
    @property
    def column_name(self) -> str:
        return self.name

    @property
    def data_type(self) -> str:
        return self.type

    @property
    def row_count(self) -> int:
        return self.samples

    @property
    def null_count(self) -> int:
        return self.nulls

    @property
    def null_percent(self) -> float:
        return (self.nulls_p or 0.0) * 100

    @property
    def distinct_count(self) -> int:
        return self.distinct

    @property
    def distinct_percent(self) -> float:
        return (self.distinct_p or 0.0) * 100

    @property
    def duration_ms(self) -> int:
        return self.elapsed_milli

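# (Editor's note) A minimal sketch of the two naming layers above: the PipeRider-style
# fields store fractions in the 0-1 range, while the legacy properties expose 0-100
# percentages (all names are from the dataclass above):
#
#   cp = ColumnProfile(name="email", type="string", samples=200, nulls=50, nulls_p=0.25)
#   cp.null_count    # -> 50   (alias for cp.nulls)
#   cp.null_percent  # -> 25.0 (cp.nulls_p * 100)
#
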
@dataclass
class TableProfile:
    """Profile result for a table."""
    source_name: str
    table_name: str
    connection_name: str
    row_count: int
    column_count: int
    columns: List[ColumnProfile]
    profile_mode: str
    profiled_at: datetime
    duration_ms: int
    alerts: List[Dict] = field(default_factory=list)
    status: str = "success"
    error: Optional[str] = None


@dataclass
class ProfileExecutionResult:
    """Result of profile execution."""
    tables_profiled: int = 0
    total_rows: int = 0
    total_columns: int = 0
    total_alerts: int = 0
    duration_ms: int = 0
    profiles: List[TableProfile] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)


class ProfileTask(BaseTask):
    """
    DAG-based profiling task for DVT (PipeRider-style).

    v0.58.1: Uses fast SQL-based profiling queries instead of slow ydata-profiling.

    Execution flow:
    1. Parse selectors (--select, --exclude)
    2. Build execution list (sources + models)
    3. For each node:
       a. Execute efficient SQL profiling queries
       b. Collect PipeRider-style metrics
       c. Store results in metadata_store.duckdb
    4. Display summary (PipeRider-style)
    """

    def __init__(
        self,
        flags: Any,
        runtime_config: RuntimeConfig,
        manifest: Manifest,
    ):
        super().__init__(flags)  # BaseTask only takes flags, sets self.args
        self.runtime_config = runtime_config
        self.manifest = manifest
        self.profile_mode = getattr(self.args, "MODE", "explorative") or "explorative"
        self._threads = getattr(self.args, "THREADS", 4) or 4

    def run(self) -> ProfileExecutionResult:
        """Execute profiling on selected sources and models."""
        start_time = time.time()
        result = ProfileExecutionResult()

        # Print header with Rich Panel
        if HAS_RICH:
            console.print()
            header_panel = Panel(
                f"[bold cyan]Mode:[/bold cyan] [yellow]{self.profile_mode}[/yellow] | "
                f"[bold cyan]Threads:[/bold cyan] [yellow]{self._threads}[/yellow]",
                title="[bold magenta]DVT Profile - Data Profiling[/bold magenta]",
                subtitle="[dim]PipeRider-style fast SQL profiling[/dim]",
                border_style="magenta",
                box=box.DOUBLE,
            )
            console.print(header_panel)
            console.print()
        else:
            print("\n" + "=" * 60)
            print(" DVT Profile - Data Profiling")
            print(f" Mode: {self.profile_mode} | Threads: {self._threads}")
            print("=" * 60 + "\n")

        # Get selected nodes
        nodes = self._get_selected_nodes()

        if not nodes:
            if HAS_RICH:
                console.print("[yellow]No sources or models selected for profiling.[/yellow]")
                console.print("[dim]Use --select to specify targets, e.g.: dvt profile run --select 'source:*'[/dim]")
            else:
                print("No sources or models selected for profiling.")
            return result

        # Profile with progress display
        if HAS_RICH:
            result = self._profile_with_progress(nodes, result)
        else:
            result = self._profile_without_progress(nodes, result)

        # Calculate duration
        result.duration_ms = int((time.time() - start_time) * 1000)

        # Print summary
        self._print_summary(result)

        return result

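    # (Editor's note) A minimal sketch of driving the task directly; constructing the
    # flags object, RuntimeConfig, and Manifest is assumed to be handled by dbt's CLI
    # plumbing (dbt.cli.requires in this package):
    #
    #   task = ProfileTask(flags, runtime_config, manifest)
    #   res = task.run()
    #   print(res.tables_profiled, res.total_rows, res.total_alerts, res.duration_ms)
    #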
    def _profile_with_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
        """Profile nodes with Rich progress display."""
        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            main_task = progress.add_task("[cyan]Profiling...", total=len(nodes))

            # Profile each node
            for i, node in enumerate(nodes, 1):
                node_name = self._get_node_display_name(node)
                progress.update(main_task, description=f"[cyan]Profiling[/cyan] [bold]{node_name}[/bold]")

                profile = self._profile_node(node, i, len(nodes))
                if profile:
                    result.profiles.append(profile)
                    result.tables_profiled += 1
                    result.total_rows += profile.row_count
                    result.total_columns += profile.column_count
                    result.total_alerts += len(profile.alerts)
                    for col in profile.columns:
                        result.total_alerts += len(col.alerts)

                    # Store in metadata_store.duckdb
                    self._store_profile(profile)

                    # Show result line
                    status_icon = "[green]OK[/green]" if profile.status == "success" else "[red]FAIL[/red]"
                    console.print(
                        f" {status_icon} {node_name} "
                        f"[dim]({profile.row_count:,} rows, {profile.column_count} cols, {profile.duration_ms}ms)[/dim]"
                    )

                progress.advance(main_task)

        return result

    def _profile_without_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
        """Profile nodes without Rich (fallback)."""
        for i, node in enumerate(nodes, 1):
            node_name = self._get_node_display_name(node)
            print(f" [{i}/{len(nodes)}] Profiling {node_name}...")

            profile = self._profile_node(node, i, len(nodes))
            if profile:
                result.profiles.append(profile)
                result.tables_profiled += 1
                result.total_rows += profile.row_count
                result.total_columns += profile.column_count

                self._store_profile(profile)

                status = "OK" if profile.status == "success" else "FAIL"
                print(f" {status} ({profile.row_count:,} rows, {profile.column_count} cols)")

        return result

    def _get_selected_nodes(self) -> List[Any]:
        """Get list of nodes to profile based on selectors."""
        nodes = []

        # If no selection, default to all sources
        selector = getattr(self.args, "SELECT", None)
        exclude = getattr(self.args, "EXCLUDE", None)

        if not selector:
            # Default: profile all sources
            for source_id, source in self.manifest.sources.items():
                nodes.append(source)
        else:
            # Parse selection
            for sel in selector:
                if isinstance(sel, tuple):
                    for s in sel:
                        nodes.extend(self._parse_selector(s))
                else:
                    nodes.extend(self._parse_selector(sel))

        # Apply exclusions
        if exclude:
            excluded = set()
            for exc in exclude:
                if isinstance(exc, tuple):
                    for e in exc:
                        excluded.update(self._get_excluded_ids(e))
                else:
                    excluded.update(self._get_excluded_ids(exc))
            nodes = [n for n in nodes if self._get_node_id(n) not in excluded]

        return nodes

    def _parse_selector(self, selector: str) -> List[Any]:
        """Parse a selector string into nodes."""
        nodes = []

        if selector.startswith("source:"):
            # Source selector: source:* or source:postgres.*
            pattern = selector[7:]  # Remove "source:" prefix
            for source_id, source in self.manifest.sources.items():
                if self._matches_pattern(source, pattern):
                    nodes.append(source)

        elif selector.startswith("model:"):
            # Model selector: model:* or model:staging.*
            pattern = selector[6:]  # Remove "model:" prefix
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "resource_type") and node.resource_type.value == "model":
                    if self._matches_pattern(node, pattern):
                        nodes.append(node)

        elif "*" in selector:
            # Wildcard - match both sources and models
            pattern = selector
            for source_id, source in self.manifest.sources.items():
                if self._matches_pattern(source, pattern):
                    nodes.append(source)
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "resource_type") and node.resource_type.value == "model":
                    if self._matches_pattern(node, pattern):
                        nodes.append(node)

        else:
            # Exact match by name
            for source_id, source in self.manifest.sources.items():
                if source.name == selector or source.identifier == selector:
                    nodes.append(source)
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "name") and node.name == selector:
                    nodes.append(node)

        return nodes

    def _matches_pattern(self, node: Any, pattern: str) -> bool:
        """Check if a node matches a glob pattern."""
        import fnmatch

        if pattern == "*":
            return True

        name = getattr(node, "name", "")
        identifier = getattr(node, "identifier", name)
        source_name = getattr(node, "source_name", "")
        unique_id = getattr(node, "unique_id", "")

        # Try matching against different attributes
        full_name = f"{source_name}.{identifier}" if source_name else identifier

        # Extract just the source_name.table portion from unique_id
        # unique_id format: source.project_name.source_name.table_name
        # We want to match against: project_name.source_name.table_name
        parts = unique_id.split(".")
        if len(parts) >= 4 and parts[0] == "source":
            # project_name.source_name.table_name
            project_source_table = ".".join(parts[1:])
            source_table = ".".join(parts[2:])  # source_name.table_name
        else:
            project_source_table = unique_id
            source_table = full_name

        return (
            fnmatch.fnmatch(name, pattern) or
            fnmatch.fnmatch(identifier, pattern) or
            fnmatch.fnmatch(full_name, pattern) or
            fnmatch.fnmatch(project_source_table, pattern) or
            fnmatch.fnmatch(source_table, pattern) or
            fnmatch.fnmatch(unique_id, pattern)
        )

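    # (Editor's note) Illustrative matches for _parse_selector/_matches_pattern above,
    # assuming a source whose unique_id is "source.my_project.postgres.orders"
    # (hypothetical project/source/table names):
    #
    #   "source:*"                      matches every source
    #   "source:postgres.*"             matches via source_table ("postgres.orders")
    #   "source:my_project.postgres.*"  matches via project_source_table
    #   "orders"                        exact match on name/identifier
    #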
    def _get_excluded_ids(self, exclude_str: str) -> Set[str]:
        """Get IDs of nodes matching exclusion pattern."""
        ids = set()
        nodes = self._parse_selector(exclude_str)
        for node in nodes:
            ids.add(self._get_node_id(node))
        return ids

    def _get_node_id(self, node: Any) -> str:
        """Get unique ID for a node."""
        if hasattr(node, "unique_id"):
            return node.unique_id
        return getattr(node, "name", str(node))

    def _get_node_display_name(self, node: Any) -> str:
        """Get display name for a node."""
        if isinstance(node, SourceDefinition):
            return f"{node.source_name}.{node.identifier}"
        else:
            return getattr(node, "name", str(node))

    def _profile_node(self, node: Any, index: int, total: int) -> Optional[TableProfile]:
        """Profile a single node (source or model)."""
        start_time = time.time()

        # Get node info
        if isinstance(node, SourceDefinition):
            source_name = node.source_name
            table_name = node.identifier
            connection_name = getattr(node, "config", {}).get("target", "default")
            node_type = "source"
        else:
            source_name = "models"
            table_name = node.name
            connection_name = getattr(node.config, "target", "default") if hasattr(node, "config") else "default"
            node_type = "model"

        try:
            # Execute profiling
            columns = self._execute_profile(node)

            duration_ms = int((time.time() - start_time) * 1000)

            # Calculate totals
            row_count = columns[0].row_count if columns else 0

            # Collect alerts
            alerts = []
            for col in columns:
                alerts.extend(col.alerts)

            profile = TableProfile(
                source_name=source_name,
                table_name=table_name,
                connection_name=connection_name,
                row_count=row_count,
                column_count=len(columns),
                columns=columns,
                profile_mode=self.profile_mode,
                profiled_at=datetime.now(),
                duration_ms=duration_ms,
                alerts=alerts,
                status="success",
            )

            return profile

        except Exception as e:
            duration_ms = int((time.time() - start_time) * 1000)
            return TableProfile(
                source_name=source_name,
                table_name=table_name,
                connection_name=connection_name,
                row_count=0,
                column_count=0,
                columns=[],
                profile_mode=self.profile_mode,
                profiled_at=datetime.now(),
                duration_ms=duration_ms,
                status="error",
                error=str(e),
            )

    def _execute_profile(self, node: Any) -> List[ColumnProfile]:
        """
        Execute PipeRider-style profiling queries on a node.

        Uses efficient SQL queries to compute:
        - row_count, null_count, distinct_count
        - min, max, mean, stddev (numeric)
        - min_length, max_length, avg_length (string)
        - top_values (categorical)
        """
        columns = []

        # Get table info
        if isinstance(node, SourceDefinition):
            schema = node.schema
            table = node.identifier
            database = getattr(node, "database", None)
            target_name = node.config.get("target") if hasattr(node, "config") else None
        else:
            schema = node.schema
            table = node.alias or node.name
            database = getattr(node, "database", None)
            target_name = getattr(node.config, "target", None) if hasattr(node, "config") else None

        # Get adapter for connection
        adapter = self._get_adapter(target_name)

        # Get column info - either from node definition or by querying database
        node_columns = getattr(node, "columns", {})

        if not node_columns:
            # Query database for column info
            column_info = self._get_columns_from_db(adapter, database, schema, table)
        else:
            column_info = [
                (col_name, getattr(col_info, "data_type", "VARCHAR") or "VARCHAR")
                for col_name, col_info in node_columns.items()
            ]

        if not column_info:
            # Fallback: profile as single row count only
            row_count = self._get_row_count(adapter, database, schema, table)
            return [ColumnProfile(
                name="_table_",
                type="TABLE",
                schema_type="TABLE",
                total=row_count,
                samples=row_count,
            )]

        # Get row count once for all columns
        row_count = self._get_row_count(adapter, database, schema, table)

        # Profile columns in parallel using threads
        if self._threads > 1 and len(column_info) > 1:
            with ThreadPoolExecutor(max_workers=min(self._threads, len(column_info))) as executor:
                futures = {
                    executor.submit(
                        self._profile_column_sql,
                        adapter, database, schema, table,
                        col_name, col_type, row_count
                    ): (col_name, col_type)
                    for col_name, col_type in column_info
                }
                for future in as_completed(futures):
                    try:
                        profile = future.result()
                        columns.append(profile)
                    except Exception as e:
                        col_name, col_type = futures[future]
                        columns.append(ColumnProfile(
                            name=col_name,
                            type=self._classify_type(col_type),
                            schema_type=col_type,
                            total=row_count,
                            samples=row_count,
                            alerts=[{"type": "PROFILE_ERROR", "severity": "warning", "message": str(e)[:100]}]
                        ))
        else:
            # Sequential profiling
            for col_name, col_type in column_info:
                profile = self._profile_column_sql(
                    adapter, database, schema, table,
                    col_name, col_type, row_count
                )
                columns.append(profile)

        return columns

    def _get_adapter(self, target_name: Optional[str] = None):
        """Get adapter for the specified target or default."""
        from dbt.adapters.factory import get_adapter

        # Get adapter from runtime config
        adapter = get_adapter(self.runtime_config)
        return adapter

    def _get_columns_from_db(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> List[tuple]:
        """Query database to get column names and types."""
        try:
            # Use adapter's get_columns_in_relation
            from dbt.adapters.base import BaseRelation

            relation = adapter.Relation.create(
                database=database,
                schema=schema,
                identifier=table,
            )

            with adapter.connection_named("profile"):
                columns = adapter.get_columns_in_relation(relation)
                return [(col.name, col.dtype) for col in columns]
        except Exception:
            return []

    def _get_row_count(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> int:
        """Get row count from table."""
        try:
            fqn = self._build_fqn(adapter, database, schema, table)
            sql = f"SELECT COUNT(*) as cnt FROM {fqn}"

            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    return int(result[0][0])
        except Exception:
            pass
        return 0

    def _build_fqn(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> str:
        """Build fully qualified table name."""
        parts = []
        if database:
            parts.append(adapter.quote(database))
        if schema:
            parts.append(adapter.quote(schema))
        parts.append(adapter.quote(table))
        return ".".join(parts)

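    # (Editor's note) Illustrative sketch: with an adapter whose quote() wraps names in
    # double quotes (an assumption - quoting is adapter-specific),
    # _build_fqn(adapter, "analytics", "public", "orders") returns
    # '"analytics"."public"."orders"'; the database part is omitted when None.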
    def _classify_type(self, col_type: str) -> str:
        """Classify database type into PipeRider generic type."""
        col_type_lower = col_type.lower()

        if any(t in col_type_lower for t in ["int", "bigint", "smallint", "tinyint", "serial"]):
            return "integer"
        elif any(t in col_type_lower for t in ["numeric", "decimal", "float", "double", "real", "number"]):
            return "numeric"
        elif any(t in col_type_lower for t in ["char", "varchar", "text", "string", "clob"]):
            return "string"
        elif any(t in col_type_lower for t in ["date", "time", "timestamp"]):
            return "datetime"
        elif any(t in col_type_lower for t in ["bool", "boolean"]):
            return "boolean"
        else:
            return "other"

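    # (Editor's note) The branches above are substring checks evaluated in order, e.g.:
    #
    #   _classify_type("BIGINT")       # -> "integer"
    #   _classify_type("VARCHAR(255)") # -> "string"
    #   _classify_type("TIMESTAMP")    # -> "datetime"
    #
    # Because the match is substring-based, a type like "INTERVAL" contains "int" and
    # classifies as "integer" before the datetime branch is ever reached.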
    def _profile_column_sql(
        self, adapter, database: Optional[str], schema: str, table: str,
        col_name: str, col_type: str, row_count: int
    ) -> ColumnProfile:
        """
        Profile a single column using efficient SQL queries.

        PipeRider-style: Single-pass or minimal queries for all metrics.
        Uses PipeRider metric names: nulls, non_nulls, distinct, valids, etc.
        """
        start_time = time.time()

        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        fqn = self._build_fqn(adapter, database, schema, table)
        quoted_col = adapter.quote(col_name)

        try:
            # Determine column type category
            col_type_lower = col_type.lower()
            is_numeric = any(t in col_type_lower for t in [
                "int", "numeric", "decimal", "float", "double", "real", "number", "bigint", "smallint"
            ])
            is_string = any(t in col_type_lower for t in [
                "char", "varchar", "text", "string", "clob"
            ])

            # Build comprehensive profiling query based on column type
            if is_numeric:
                profile = self._profile_numeric_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )
            elif is_string:
                profile = self._profile_string_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )
            else:
                # Default: basic metrics only
                profile = self._profile_basic_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )

            # Get top values for categorical columns (not in sensitive mode)
            if self.profile_mode != "sensitive":
                if profile.distinct and profile.distinct <= 100:
                    self._add_top_values(adapter, fqn, quoted_col, profile)

        except Exception as e:
            # If SQL fails, return what we have
            profile.alerts.append({
                "type": "PROFILE_ERROR",
                "severity": "warning",
                "message": f"Could not profile column: {str(e)[:100]}",
            })

        # Generate quality alerts
        profile.alerts.extend(self._generate_alerts(profile))

        profile.elapsed_milli = int((time.time() - start_time) * 1000)
        profile.profile_duration = f"{(time.time() - start_time):.2f}"
        return profile

    def _profile_numeric_column(
        self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
    ) -> ColumnProfile:
        """Profile a numeric column with all stats in one query (PipeRider-style)."""
        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        # Single comprehensive query for numeric columns (PipeRider-style)
        sql = f"""
            SELECT
                COUNT(*) - COUNT({quoted_col}) as nulls,
                COUNT({quoted_col}) as non_nulls,
                COUNT(DISTINCT {quoted_col}) as distinct_val,
                MIN({quoted_col}) as min_val,
                MAX({quoted_col}) as max_val,
                SUM(CAST({quoted_col} AS DOUBLE PRECISION)) as sum_val,
                AVG(CAST({quoted_col} AS DOUBLE PRECISION)) as avg_val,
                STDDEV(CAST({quoted_col} AS DOUBLE PRECISION)) as stddev_val,
                SUM(CASE WHEN {quoted_col} = 0 THEN 1 ELSE 0 END) as zeros,
                SUM(CASE WHEN {quoted_col} < 0 THEN 1 ELSE 0 END) as negatives,
                SUM(CASE WHEN {quoted_col} > 0 THEN 1 ELSE 0 END) as positives
            FROM {fqn}
        """

        try:
            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    row = result[0]
                    # PipeRider-style metric names
                    profile.nulls = int(row[0] or 0)
                    profile.non_nulls = int(row[1] or 0)
                    profile.distinct = int(row[2] or 0)
                    profile.min = float(row[3]) if row[3] is not None else None
                    profile.max = float(row[4]) if row[4] is not None else None
                    profile.sum = float(row[5]) if row[5] is not None else None
                    profile.avg = float(row[6]) if row[6] is not None else None
                    profile.stddev = float(row[7]) if row[7] is not None else None
                    profile.zeros = int(row[8] or 0)
                    profile.negatives = int(row[9] or 0)
                    profile.positives = int(row[10] or 0)

                    # Calculate percentages (PipeRider-style with decimal 0-1)
                    if row_count > 0:
                        profile.nulls_p = profile.nulls / row_count
                        profile.non_nulls_p = profile.non_nulls / row_count
                        profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None
                        profile.zeros_p = profile.zeros / row_count
                        profile.negatives_p = profile.negatives / row_count
                        profile.positives_p = profile.positives / row_count

                    # Validity metrics (for numeric, valid = non-null)
                    profile.valids = profile.non_nulls
                    profile.valids_p = profile.non_nulls_p
                    profile.invalids = profile.nulls
                    profile.invalids_p = profile.nulls_p

                    # Duplicate metrics
                    if profile.non_nulls > 0 and profile.distinct > 0:
                        profile.non_duplicates = profile.distinct
                        profile.duplicates = profile.non_nulls - profile.distinct
                        profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
                        profile.duplicates_p = profile.duplicates / profile.non_nulls

            # Try to get percentiles for explorative mode
            if self.profile_mode in ["explorative", "time-series"]:
                self._add_percentiles(adapter, fqn, quoted_col, profile)

        except Exception:
            # Fall back to basic profile
            profile = self._profile_basic_column(
                adapter, fqn, quoted_col, col_name, col_type, row_count
            )

        return profile

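    # (Editor's note) For a hypothetical numeric column "amount" on
    # '"shop"."public"."orders"', the f-string in _profile_numeric_column renders to a
    # single-scan query of the form:
    #
    #   SELECT COUNT(*) - COUNT("amount") as nulls, COUNT("amount") as non_nulls,
    #          COUNT(DISTINCT "amount") as distinct_val, MIN("amount") as min_val, ...
    #   FROM "shop"."public"."orders"
    #
    # STDDEV and CAST(... AS DOUBLE PRECISION) assume an ANSI-ish dialect; when the
    # query fails, the code falls back to _profile_basic_column.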
def _profile_string_column(
|
|
895
|
+
self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
|
|
896
|
+
) -> ColumnProfile:
|
|
897
|
+
"""Profile a string column with all stats in one query (PipeRider-style)."""
|
|
898
|
+
generic_type = self._classify_type(col_type)
|
|
899
|
+
profile = ColumnProfile(
|
|
900
|
+
name=col_name,
|
|
901
|
+
type=generic_type,
|
|
902
|
+
schema_type=col_type,
|
|
903
|
+
total=row_count,
|
|
904
|
+
samples=row_count,
|
|
905
|
+
samples_p=1.0,
|
|
906
|
+
)
|
|
907
|
+
|
|
908
|
+
# Single comprehensive query for string columns (PipeRider-style)
|
|
909
|
+
sql = f"""
|
|
910
|
+
SELECT
|
|
911
|
+
COUNT(*) - COUNT({quoted_col}) as nulls,
|
|
912
|
+
COUNT({quoted_col}) as non_nulls,
|
|
913
|
+
COUNT(DISTINCT {quoted_col}) as distinct_val,
|
|
914
|
+
MIN(LENGTH({quoted_col})) as min_len,
|
|
915
|
+
MAX(LENGTH({quoted_col})) as max_len,
|
|
916
|
+
AVG(LENGTH({quoted_col})) as avg_len,
|
|
917
|
+
SUM(CASE WHEN LENGTH({quoted_col}) = 0 THEN 1 ELSE 0 END) as zero_length_count
|
|
918
|
+
FROM {fqn}
|
|
919
|
+
"""
|
|
920
|
+
|
|
921
|
+
try:
|
|
922
|
+
with adapter.connection_named("profile"):
|
|
923
|
+
_, result = adapter.execute(sql, fetch=True)
|
|
924
|
+
if result and len(result) > 0:
|
|
925
|
+
row = result[0]
|
|
926
|
+
# PipeRider-style metric names
|
|
927
|
+
profile.nulls = int(row[0] or 0)
|
|
928
|
+
profile.non_nulls = int(row[1] or 0)
|
|
929
|
+
profile.distinct = int(row[2] or 0)
|
|
930
|
+
profile.min_length = int(row[3]) if row[3] is not None else None
|
|
931
|
+
profile.max_length = int(row[4]) if row[4] is not None else None
|
|
932
|
+
profile.avg_length = float(row[5]) if row[5] is not None else None
|
|
933
|
+
profile.zero_length = int(row[6] or 0)
|
|
934
|
+
|
|
935
|
+
# Calculate percentages (PipeRider-style with decimal 0-1)
|
|
936
|
+
if row_count > 0:
|
|
937
|
+
profile.nulls_p = profile.nulls / row_count
|
|
938
|
+
profile.non_nulls_p = profile.non_nulls / row_count
|
|
939
|
+
profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None
|
|
940
|
+
profile.zero_length_p = profile.zero_length / row_count
|
|
941
|
+
|
|
942
|
+
# Validity metrics (for string, valid = non-null non-empty)
|
|
943
|
+
profile.valids = profile.non_nulls - profile.zero_length
|
|
944
|
+
profile.invalids = profile.nulls + profile.zero_length
|
|
945
|
+
if row_count > 0:
|
|
946
|
+
profile.valids_p = profile.valids / row_count
|
|
947
|
+
profile.invalids_p = profile.invalids / row_count
|
|
948
|
+
|
|
949
|
+
# Non-zero length
|
|
950
|
+
profile.non_zero_length = profile.non_nulls - profile.zero_length
|
|
951
|
+
if profile.non_nulls > 0:
|
|
952
|
+
profile.non_zero_length_p = profile.non_zero_length / profile.non_nulls
|
|
953
|
+
|
|
954
|
+
# Duplicate metrics
|
|
955
|
+
if profile.non_nulls > 0 and profile.distinct > 0:
|
|
956
|
+
profile.non_duplicates = profile.distinct
|
|
957
|
+
profile.duplicates = profile.non_nulls - profile.distinct
|
|
958
|
+
profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
|
|
959
|
+
profile.duplicates_p = profile.duplicates / profile.non_nulls
|
|
960
|
+
|
|
961
|
+
except Exception:
|
|
962
|
+
# Fall back to basic profile
|
|
963
|
+
profile = self._profile_basic_column(
|
|
964
|
+
adapter, fqn, quoted_col, col_name, col_type, row_count
|
|
965
|
+
)
|
|
966
|
+
|
|
967
|
+
return profile
|
|
968
|
+
|
|
969
|
+
def _profile_basic_column(
|
|
970
|
+
self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
|
|
971
|
+
) -> ColumnProfile:
|
|
972
|
+
"""Profile any column with basic metrics only (PipeRider-style)."""
|
|
973
|
+
generic_type = self._classify_type(col_type)
|
|
974
|
+
profile = ColumnProfile(
|
|
975
|
+
name=col_name,
|
|
976
|
+
type=generic_type,
|
|
977
|
+
schema_type=col_type,
|
|
978
|
+
total=row_count,
|
|
979
|
+
samples=row_count,
|
|
980
|
+
samples_p=1.0,
|
|
981
|
+
)
|
|
982
|
+
|
|
983
|
+
sql = f"""
|
|
984
|
+
SELECT
|
|
985
|
+
COUNT(*) - COUNT({quoted_col}) as nulls,
|
|
986
|
+
COUNT({quoted_col}) as non_nulls,
|
|
987
|
+
COUNT(DISTINCT {quoted_col}) as distinct_val
|
|
988
|
+
FROM {fqn}
|
|
989
|
+
"""
|
|
990
|
+
|
|
991
|
+
try:
|
|
992
|
+
with adapter.connection_named("profile"):
|
|
993
|
+
_, result = adapter.execute(sql, fetch=True)
|
|
994
|
+
if result and len(result) > 0:
|
|
995
|
+
profile.nulls = int(result[0][0] or 0)
|
|
996
|
+
profile.non_nulls = int(result[0][1] or 0)
|
|
997
|
+
profile.distinct = int(result[0][2] or 0)
|
|
998
|
+
|
|
999
|
+
# Calculate percentages (PipeRider-style with decimal 0-1)
|
|
1000
|
+
if row_count > 0:
|
|
1001
|
+
profile.nulls_p = profile.nulls / row_count
|
|
1002
|
+
profile.non_nulls_p = profile.non_nulls / row_count
|
|
1003
|
+
profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None
|
|
1004
|
+
|
|
1005
|
+
# Validity metrics
|
|
1006
|
+
profile.valids = profile.non_nulls
|
|
1007
|
+
profile.valids_p = profile.non_nulls_p
|
|
1008
|
+
profile.invalids = profile.nulls
|
|
1009
|
+
profile.invalids_p = profile.nulls_p
|
|
1010
|
+
|
|
1011
|
+
# Duplicate metrics
|
|
1012
|
+
if profile.non_nulls > 0 and profile.distinct > 0:
|
|
1013
|
+
profile.non_duplicates = profile.distinct
|
|
1014
|
+
profile.duplicates = profile.non_nulls - profile.distinct
|
|
1015
|
+
profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
|
|
1016
|
+
profile.duplicates_p = profile.duplicates / profile.non_nulls
|
|
1017
|
+
|
|
1018
|
+
except Exception:
|
|
1019
|
+
pass
|
|
1020
|
+
|
|
1021
|
+
return profile
|
|
1022
|
+
|
|
1023
|
+
def _add_percentiles(self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile) -> None:
|
|
1024
|
+
"""Try to add percentiles to numeric profile."""
|
|
1025
|
+
try:
|
|
1026
|
+
# Try PostgreSQL/Redshift style
|
|
1027
|
+
percentile_sql = f"""
|
|
1028
|
+
SELECT
|
|
1029
|
+
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {quoted_col}) as p25,
|
|
1030
|
+
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY {quoted_col}) as p50,
|
|
1031
|
+
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {quoted_col}) as p75
|
|
1032
|
+
FROM {fqn}
|
|
1033
|
+
WHERE {quoted_col} IS NOT NULL
|
|
1034
|
+
"""
|
|
1035
|
+
with adapter.connection_named("profile"):
|
|
1036
|
+
_, result = adapter.execute(percentile_sql, fetch=True)
|
|
1037
|
+
if result and len(result) > 0:
|
|
1038
|
+
row = result[0]
|
|
1039
|
+
profile.p25 = float(row[0]) if row[0] is not None else None
|
|
1040
|
+
profile.p50 = float(row[1]) if row[1] is not None else None
|
|
1041
|
+
profile.p75 = float(row[2]) if row[2] is not None else None
|
|
1042
|
+
profile.median_value = profile.p50
|
|
1043
|
+
except Exception:
|
|
1044
|
+
# Percentiles not supported on this database
|
|
1045
|
+
pass
|
|
1046
|
+
|
|
+    def _add_top_values(self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile) -> None:
+        """Add top values to profile (PipeRider topk format)."""
+        try:
+            top_sql = f"""
+                SELECT {quoted_col} as val, COUNT(*) as cnt
+                FROM {fqn}
+                WHERE {quoted_col} IS NOT NULL
+                GROUP BY {quoted_col}
+                ORDER BY cnt DESC
+                LIMIT 10
+            """
+            with adapter.connection_named("profile"):
+                _, result = adapter.execute(top_sql, fetch=True)
+                if result:
+                    # PipeRider topk format: {"values": [...], "counts": [...]}
+                    values = [str(row[0]) for row in result]
+                    counts = [int(row[1]) for row in result]
+                    profile.topk = {
+                        "values": values,
+                        "counts": counts,
+                    }
+        except Exception:
+            pass
+
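The resulting `topk` payload is a pair of parallel lists ordered by descending frequency. With toy data it looks like this:

```python
# Illustrative topk payload (toy data, not from a real run):
topk = {
    "values": ["completed", "pending", "cancelled"],
    "counts": [9412, 1033, 87],
}
# values[i] occurred counts[i] times; zip() recovers (value, count) pairs.
top_pairs = list(zip(topk["values"], topk["counts"]))
```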
+    def _generate_alerts(self, profile: ColumnProfile) -> List[Dict]:
+        """
+        Generate quality alerts for a column profile (PipeRider-style).
+
+        PipeRider alert types (from piperider_cli/profiler/event.py):
+        - missing_value: High percentage of null/missing values
+        - high_distinct: Very high cardinality (possible PK)
+        - low_distinct: Very low cardinality (possible boolean/flag)
+        - all_null: 100% null values
+        - constant: All values are the same
+        - negative_value: Has negative values in numeric column
+        - zero_length_string: Has empty strings
+        """
+        alerts = []
+
+        # Get null percentage (as 0-100 for comparison)
+        nulls_pct = (profile.nulls_p or 0) * 100 if profile.nulls_p is not None else 0
+        distinct_pct = (profile.distinct_p or 0) * 100 if profile.distinct_p is not None else 0
+
+        # High null rate alert (PipeRider: missing_value)
+        if nulls_pct > 50:
+            alerts.append({
+                "type": "missing_value",
+                "severity": "error",
+                "column": profile.name,
+                "message": f"Column has {nulls_pct:.1f}% null values (>50%)",
+            })
+        elif nulls_pct > 20:
+            alerts.append({
+                "type": "missing_value",
+                "severity": "warning",
+                "column": profile.name,
+                "message": f"Column has {nulls_pct:.1f}% null values",
+            })
+
+        # High cardinality alert (PipeRider: high_distinct)
+        if distinct_pct > 99 and profile.samples > 100:
+            alerts.append({
+                "type": "high_distinct",
+                "severity": "info",
+                "column": profile.name,
+                "message": f"Column is {distinct_pct:.1f}% unique (possible primary key)",
+            })
+
+        # Low cardinality (PipeRider: low_distinct)
+        if profile.distinct and profile.distinct < 10 and profile.samples > 1000:
+            alerts.append({
+                "type": "low_distinct",
+                "severity": "info",
+                "column": profile.name,
+                "message": f"Column has only {profile.distinct} distinct values (possible category)",
+            })
+
+        # All nulls alert (PipeRider: all_null)
+        if nulls_pct >= 100 or (profile.non_nulls == 0 and profile.nulls > 0):
+            alerts.append({
+                "type": "all_null",
+                "severity": "error",
+                "column": profile.name,
+                "message": "Column is 100% null - consider removing",
+            })
+
+        # Zero variance / Constant alert (PipeRider: constant)
+        if profile.min is not None and profile.max is not None:
+            if profile.min == profile.max and profile.distinct == 1:
+                alerts.append({
+                    "type": "constant",
+                    "severity": "warning",
+                    "column": profile.name,
+                    "message": f"Column has constant value: {profile.min}",
+                })
+
+        # Negative values (PipeRider: negative_value) - informational only
+        if profile.negatives and profile.negatives > 0:
+            negatives_pct = (profile.negatives_p or 0) * 100
+            if negatives_pct > 50:
+                alerts.append({
+                    "type": "negative_value",
+                    "severity": "info",
+                    "column": profile.name,
+                    "message": f"Column has {negatives_pct:.1f}% negative values",
+                })
+
+        # Zero-length strings (PipeRider: zero_length_string)
+        if profile.zero_length and profile.zero_length > 0:
+            zero_len_pct = (profile.zero_length_p or 0) * 100
+            if zero_len_pct > 10:
+                alerts.append({
+                    "type": "zero_length_string",
+                    "severity": "warning",
+                    "column": profile.name,
+                    "message": f"Column has {zero_len_pct:.1f}% empty strings",
+                })
+
+        return alerts
+
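Each alert is a plain dict. For a column with, say, 23.4% nulls, `_generate_alerts` would emit a record shaped like this (column name and percentage are illustrative):

```python
# Illustrative alert record as produced by _generate_alerts:
alert = {
    "type": "missing_value",    # one of the PipeRider alert types listed above
    "severity": "warning",      # "error" | "warning" | "info"
    "column": "customer_email", # hypothetical column name
    "message": "Column has 23.4% null values",
}
```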
+    def _store_profile(self, profile: TableProfile) -> None:
+        """Store profile results in metadata_store.duckdb."""
+        try:
+            # Check if DuckDB is available
+            try:
+                import duckdb
+            except ImportError:
+                if HAS_RICH:
+                    console.print("[yellow]Warning: DuckDB not available. Profile results will not be persisted.[/yellow]")
+                return
+
+            from dbt.compute.metadata import ProjectMetadataStore, ColumnProfileResult
+
+            project_root = Path(self.runtime_config.project_root)
+            store = ProjectMetadataStore(project_root)
+            store.initialize()
+
+            for col in profile.columns:
+                result = ColumnProfileResult(
+                    source_name=profile.source_name,
+                    table_name=profile.table_name,
+                    column_name=col.column_name,
+                    profile_mode=profile.profile_mode,
+                    row_count=col.row_count,
+                    null_count=col.null_count,
+                    null_percent=col.null_percent,
+                    distinct_count=col.distinct_count,
+                    distinct_percent=col.distinct_percent,
+                    min_value=col.min_value,
+                    max_value=col.max_value,
+                    mean_value=col.mean_value,
+                    median_value=col.median_value,
+                    stddev_value=col.stddev_value,
+                    p25=col.p25,
+                    p50=col.p50,
+                    p75=col.p75,
+                    min_length=col.min_length,
+                    max_length=col.max_length,
+                    avg_length=col.avg_length,
+                    histogram=json.dumps(col.histogram) if col.histogram else None,
+                    top_values=json.dumps(col.top_values) if col.top_values else None,
+                    alerts=json.dumps(col.alerts) if col.alerts else None,
+                    profiled_at=profile.profiled_at,
+                    duration_ms=col.duration_ms,
+                )
+                store.save_profile_result(result)
+
+            store.close()
+
+        except Exception as e:
+            # Log but don't fail if storage fails
+            if HAS_RICH:
+                console.print(f"[yellow]Warning: Could not store profile results: {e}[/yellow]")
+
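Since the store is a plain DuckDB file, the persisted rows can in principle be inspected directly. A minimal sketch, assuming the `.dvt/metadata_store.duckdb` path shown in the summary output and a hypothetical `profile_results` table name (the actual schema is owned by `ProjectMetadataStore`):

```python
# Sketch: reading persisted profile rows straight out of the DuckDB file.
# "profile_results" is a guessed table name; check ProjectMetadataStore
# for the real schema before relying on this.
import duckdb

con = duckdb.connect(".dvt/metadata_store.duckdb", read_only=True)
rows = con.execute(
    "SELECT column_name, null_percent, distinct_count "
    "FROM profile_results ORDER BY null_percent DESC LIMIT 10"
).fetchall()
con.close()
```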
+    def _print_summary(self, result: ProfileExecutionResult) -> None:
+        """Print PipeRider-style summary with Rich formatting."""
+        if not HAS_RICH:
+            print("\n" + "=" * 60)
+            print(" SUMMARY")
+            print(f" Tables profiled: {result.tables_profiled}")
+            print(f" Total rows: {result.total_rows:,}")
+            print(f" Total columns: {result.total_columns}")
+            print(f" Alerts: {result.total_alerts}")
+            print(f" Duration: {result.duration_ms / 1000:.1f}s")
+            print("=" * 60 + "\n")
+            return
+
+        console.print()
+
+        # Summary panel
+        summary_lines = [
+            f"[bold]Tables profiled:[/bold] {result.tables_profiled}",
+            f"[bold]Total rows:[/bold] {result.total_rows:,}",
+            f"[bold]Total columns:[/bold] {result.total_columns}",
+        ]
+
+        if result.total_alerts > 0:
+            summary_lines.append(f"[bold yellow]Alerts:[/bold yellow] {result.total_alerts}")
+        else:
+            summary_lines.append(f"[bold green]Alerts:[/bold green] 0")
+
+        summary_lines.append(f"[dim]Duration:[/dim] {result.duration_ms / 1000:.1f}s")
+
+        console.print(Panel(
+            "\n".join(summary_lines),
+            title="[bold cyan]Summary[/bold cyan]",
+            border_style="cyan",
+            box=box.ROUNDED,
+        ))
+
+        # List alerts if any
+        if result.total_alerts > 0:
+            console.print()
+            console.print("[bold yellow]Alerts:[/bold yellow]")
+            console.print()
+
+            alerts_table = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
+            alerts_table.add_column("Severity", style="bold", width=8)
+            alerts_table.add_column("Type", style="cyan", width=15)
+            alerts_table.add_column("Location", style="white", width=30)
+            alerts_table.add_column("Message", style="dim")
+
+            for profile in result.profiles:
+                for col in profile.columns:
+                    for alert in col.alerts:
+                        if alert["severity"] == "error":
+                            sev_display = "[red]ERROR[/red]"
+                        elif alert["severity"] == "warning":
+                            sev_display = "[yellow]WARN[/yellow]"
+                        else:
+                            sev_display = "[blue]INFO[/blue]"
+
+                        location = f"{profile.table_name}.{col.column_name}"
+                        alerts_table.add_row(
+                            sev_display,
+                            alert["type"],
+                            location,
+                            alert["message"]
+                        )
+
+            console.print(alerts_table)
+
+        console.print()
+
+        # Success footer
+        if result.tables_profiled > 0:
+            console.print("[bold green]Profiling complete![/bold green]")
+            console.print()
+            console.print("[cyan]Results saved to:[/cyan] [bold].dvt/metadata_store.duckdb[/bold]")
+            console.print("[dim]View report: dvt profile serve[/dim]")
+        else:
+            console.print("[yellow]No tables were profiled.[/yellow]")
+
+        console.print()
+
+    def interpret_results(self, result: ProfileExecutionResult) -> bool:
+        """Interpret results to determine success/failure."""
+        if not result.profiles:
+            return False
+        # Success if at least one profile completed
+        return any(p.status == "success" for p in result.profiles)
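In other words, a run counts as successful as soon as any per-table profile finished. A quick sketch with stand-in objects (not the real result classes):

```python
# Stand-in objects demonstrating interpret_results' success rule.
from types import SimpleNamespace

profiles = [SimpleNamespace(status="success"), SimpleNamespace(status="error")]
ok = bool(profiles) and any(p.status == "success" for p in profiles)  # True
```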