dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2403 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
  74. dbt/compute/engines/spark_engine.py +642 -0
  75. dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
  76. dbt/compute/federated_executor.py +1080 -0
  77. dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
  78. dbt/compute/filter_pushdown.py +273 -0
  79. dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
  80. dbt/compute/jar_provisioning.py +255 -0
  81. dbt/compute/java_compat.cpython-311-darwin.so +0 -0
  82. dbt/compute/java_compat.py +689 -0
  83. dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
  84. dbt/compute/jdbc_utils.py +678 -0
  85. dbt/compute/metadata/__init__.py +40 -0
  86. dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
  87. dbt/compute/metadata/adapters_registry.py +370 -0
  88. dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
  89. dbt/compute/metadata/registry.py +674 -0
  90. dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
  91. dbt/compute/metadata/store.py +1499 -0
  92. dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
  93. dbt/compute/smart_selector.py +377 -0
  94. dbt/compute/strategies/__init__.py +55 -0
  95. dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
  96. dbt/compute/strategies/base.py +165 -0
  97. dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
  98. dbt/compute/strategies/dataproc.py +207 -0
  99. dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
  100. dbt/compute/strategies/emr.py +203 -0
  101. dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
  102. dbt/compute/strategies/local.py +443 -0
  103. dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
  104. dbt/compute/strategies/standalone.py +262 -0
  105. dbt/config/__init__.py +4 -0
  106. dbt/config/catalogs.py +94 -0
  107. dbt/config/compute.cpython-311-darwin.so +0 -0
  108. dbt/config/compute.py +513 -0
  109. dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
  110. dbt/config/dvt_profile.py +342 -0
  111. dbt/config/profile.py +422 -0
  112. dbt/config/project.py +873 -0
  113. dbt/config/project_utils.py +28 -0
  114. dbt/config/renderer.py +231 -0
  115. dbt/config/runtime.py +553 -0
  116. dbt/config/selectors.py +208 -0
  117. dbt/config/utils.py +77 -0
  118. dbt/constants.py +28 -0
  119. dbt/context/__init__.py +0 -0
  120. dbt/context/base.py +745 -0
  121. dbt/context/configured.py +135 -0
  122. dbt/context/context_config.py +382 -0
  123. dbt/context/docs.py +82 -0
  124. dbt/context/exceptions_jinja.py +178 -0
  125. dbt/context/macro_resolver.py +195 -0
  126. dbt/context/macros.py +171 -0
  127. dbt/context/manifest.py +72 -0
  128. dbt/context/providers.py +2249 -0
  129. dbt/context/query_header.py +13 -0
  130. dbt/context/secret.py +58 -0
  131. dbt/context/target.py +74 -0
  132. dbt/contracts/__init__.py +0 -0
  133. dbt/contracts/files.py +413 -0
  134. dbt/contracts/graph/__init__.py +0 -0
  135. dbt/contracts/graph/manifest.py +1904 -0
  136. dbt/contracts/graph/metrics.py +97 -0
  137. dbt/contracts/graph/model_config.py +70 -0
  138. dbt/contracts/graph/node_args.py +42 -0
  139. dbt/contracts/graph/nodes.py +1806 -0
  140. dbt/contracts/graph/semantic_manifest.py +232 -0
  141. dbt/contracts/graph/unparsed.py +811 -0
  142. dbt/contracts/project.py +417 -0
  143. dbt/contracts/results.py +53 -0
  144. dbt/contracts/selection.py +23 -0
  145. dbt/contracts/sql.py +85 -0
  146. dbt/contracts/state.py +68 -0
  147. dbt/contracts/util.py +46 -0
  148. dbt/deprecations.py +348 -0
  149. dbt/deps/__init__.py +0 -0
  150. dbt/deps/base.py +152 -0
  151. dbt/deps/git.py +195 -0
  152. dbt/deps/local.py +79 -0
  153. dbt/deps/registry.py +130 -0
  154. dbt/deps/resolver.py +149 -0
  155. dbt/deps/tarball.py +120 -0
  156. dbt/docs/source/_ext/dbt_click.py +119 -0
  157. dbt/docs/source/conf.py +32 -0
  158. dbt/env_vars.py +64 -0
  159. dbt/event_time/event_time.py +40 -0
  160. dbt/event_time/sample_window.py +60 -0
  161. dbt/events/__init__.py +15 -0
  162. dbt/events/base_types.py +36 -0
  163. dbt/events/core_types_pb2.py +2 -0
  164. dbt/events/logging.py +108 -0
  165. dbt/events/types.py +2516 -0
  166. dbt/exceptions.py +1486 -0
  167. dbt/flags.py +89 -0
  168. dbt/graph/__init__.py +11 -0
  169. dbt/graph/cli.py +249 -0
  170. dbt/graph/graph.py +172 -0
  171. dbt/graph/queue.py +214 -0
  172. dbt/graph/selector.py +374 -0
  173. dbt/graph/selector_methods.py +975 -0
  174. dbt/graph/selector_spec.py +222 -0
  175. dbt/graph/thread_pool.py +18 -0
  176. dbt/hooks.py +21 -0
  177. dbt/include/README.md +49 -0
  178. dbt/include/__init__.py +3 -0
  179. dbt/include/data/adapters_registry.duckdb +0 -0
  180. dbt/include/data/build_registry.py +242 -0
  181. dbt/include/data/csv/adapter_queries.csv +33 -0
  182. dbt/include/data/csv/syntax_rules.csv +9 -0
  183. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  184. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  185. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  186. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  187. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  188. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  189. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  190. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  191. dbt/include/starter_project/.gitignore +4 -0
  192. dbt/include/starter_project/README.md +15 -0
  193. dbt/include/starter_project/__init__.py +3 -0
  194. dbt/include/starter_project/analyses/.gitkeep +0 -0
  195. dbt/include/starter_project/dbt_project.yml +36 -0
  196. dbt/include/starter_project/macros/.gitkeep +0 -0
  197. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  198. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  199. dbt/include/starter_project/models/example/schema.yml +21 -0
  200. dbt/include/starter_project/seeds/.gitkeep +0 -0
  201. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  202. dbt/include/starter_project/tests/.gitkeep +0 -0
  203. dbt/internal_deprecations.py +26 -0
  204. dbt/jsonschemas/__init__.py +3 -0
  205. dbt/jsonschemas/jsonschemas.py +309 -0
  206. dbt/jsonschemas/project/0.0.110.json +4717 -0
  207. dbt/jsonschemas/project/0.0.85.json +2015 -0
  208. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  209. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  210. dbt/jsonschemas/resources/latest.json +6773 -0
  211. dbt/links.py +4 -0
  212. dbt/materializations/__init__.py +0 -0
  213. dbt/materializations/incremental/__init__.py +0 -0
  214. dbt/materializations/incremental/microbatch.py +236 -0
  215. dbt/mp_context.py +8 -0
  216. dbt/node_types.py +37 -0
  217. dbt/parser/__init__.py +23 -0
  218. dbt/parser/analysis.py +21 -0
  219. dbt/parser/base.py +548 -0
  220. dbt/parser/common.py +266 -0
  221. dbt/parser/docs.py +52 -0
  222. dbt/parser/fixtures.py +51 -0
  223. dbt/parser/functions.py +30 -0
  224. dbt/parser/generic_test.py +100 -0
  225. dbt/parser/generic_test_builders.py +333 -0
  226. dbt/parser/hooks.py +118 -0
  227. dbt/parser/macros.py +137 -0
  228. dbt/parser/manifest.py +2204 -0
  229. dbt/parser/models.py +573 -0
  230. dbt/parser/partial.py +1178 -0
  231. dbt/parser/read_files.py +445 -0
  232. dbt/parser/schema_generic_tests.py +422 -0
  233. dbt/parser/schema_renderer.py +111 -0
  234. dbt/parser/schema_yaml_readers.py +935 -0
  235. dbt/parser/schemas.py +1466 -0
  236. dbt/parser/search.py +149 -0
  237. dbt/parser/seeds.py +28 -0
  238. dbt/parser/singular_test.py +20 -0
  239. dbt/parser/snapshots.py +44 -0
  240. dbt/parser/sources.py +558 -0
  241. dbt/parser/sql.py +62 -0
  242. dbt/parser/unit_tests.py +621 -0
  243. dbt/plugins/__init__.py +20 -0
  244. dbt/plugins/contracts.py +9 -0
  245. dbt/plugins/exceptions.py +2 -0
  246. dbt/plugins/manager.py +163 -0
  247. dbt/plugins/manifest.py +21 -0
  248. dbt/profiler.py +20 -0
  249. dbt/py.typed +1 -0
  250. dbt/query_analyzer.cpython-311-darwin.so +0 -0
  251. dbt/query_analyzer.py +410 -0
  252. dbt/runners/__init__.py +2 -0
  253. dbt/runners/exposure_runner.py +7 -0
  254. dbt/runners/no_op_runner.py +45 -0
  255. dbt/runners/saved_query_runner.py +7 -0
  256. dbt/selected_resources.py +8 -0
  257. dbt/task/__init__.py +0 -0
  258. dbt/task/base.py +503 -0
  259. dbt/task/build.py +197 -0
  260. dbt/task/clean.py +56 -0
  261. dbt/task/clone.py +161 -0
  262. dbt/task/compile.py +150 -0
  263. dbt/task/compute.cpython-311-darwin.so +0 -0
  264. dbt/task/compute.py +458 -0
  265. dbt/task/debug.py +505 -0
  266. dbt/task/deps.py +280 -0
  267. dbt/task/docs/__init__.py +3 -0
  268. dbt/task/docs/api/__init__.py +23 -0
  269. dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
  270. dbt/task/docs/api/catalog.py +204 -0
  271. dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
  272. dbt/task/docs/api/lineage.py +234 -0
  273. dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
  274. dbt/task/docs/api/profile.py +204 -0
  275. dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
  276. dbt/task/docs/api/spark.py +186 -0
  277. dbt/task/docs/generate.py +947 -0
  278. dbt/task/docs/index.html +250 -0
  279. dbt/task/docs/serve.cpython-311-darwin.so +0 -0
  280. dbt/task/docs/serve.py +174 -0
  281. dbt/task/dvt_output.py +362 -0
  282. dbt/task/dvt_run.py +204 -0
  283. dbt/task/freshness.py +322 -0
  284. dbt/task/function.py +121 -0
  285. dbt/task/group_lookup.py +46 -0
  286. dbt/task/init.cpython-311-darwin.so +0 -0
  287. dbt/task/init.py +604 -0
  288. dbt/task/java.cpython-311-darwin.so +0 -0
  289. dbt/task/java.py +316 -0
  290. dbt/task/list.py +236 -0
  291. dbt/task/metadata.cpython-311-darwin.so +0 -0
  292. dbt/task/metadata.py +804 -0
  293. dbt/task/printer.py +175 -0
  294. dbt/task/profile.cpython-311-darwin.so +0 -0
  295. dbt/task/profile.py +1307 -0
  296. dbt/task/profile_serve.py +615 -0
  297. dbt/task/retract.py +438 -0
  298. dbt/task/retry.py +175 -0
  299. dbt/task/run.py +1387 -0
  300. dbt/task/run_operation.py +141 -0
  301. dbt/task/runnable.py +758 -0
  302. dbt/task/seed.py +103 -0
  303. dbt/task/show.py +149 -0
  304. dbt/task/snapshot.py +56 -0
  305. dbt/task/spark.cpython-311-darwin.so +0 -0
  306. dbt/task/spark.py +414 -0
  307. dbt/task/sql.py +110 -0
  308. dbt/task/target_sync.cpython-311-darwin.so +0 -0
  309. dbt/task/target_sync.py +766 -0
  310. dbt/task/test.py +464 -0
  311. dbt/tests/fixtures/__init__.py +1 -0
  312. dbt/tests/fixtures/project.py +620 -0
  313. dbt/tests/util.py +651 -0
  314. dbt/tracking.py +529 -0
  315. dbt/utils/__init__.py +3 -0
  316. dbt/utils/artifact_upload.py +151 -0
  317. dbt/utils/utils.py +408 -0
  318. dbt/version.py +270 -0
  319. dvt_cli/__init__.py +72 -0
  320. dvt_core-0.58.6.dist-info/METADATA +288 -0
  321. dvt_core-0.58.6.dist-info/RECORD +324 -0
  322. dvt_core-0.58.6.dist-info/WHEEL +5 -0
  323. dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
  324. dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/task/profile.py ADDED
@@ -0,0 +1,1307 @@
"""
DVT Profile Task

Data profiling task with DAG-based execution for sources and models.
Works like 'dvt run' with full selector support and DVT compute rules.

v0.56.0: Initial implementation with 4 profiling modes.
v0.58.1: PipeRider-style profiling - fast SQL-based metrics instead of ydata-profiling.

Modes:
- explorative: Full profiling (distributions, patterns, correlations) [DEFAULT]
- minimal: Basic stats (null%, distinct%, min/max)
- sensitive: Redacted profiling (masks PII-like columns)
- time-series: Temporal analysis (ACF, PACF, seasonality)

PipeRider-Style Metrics (v0.58.1):
- row_count, column_count
- null_count, null_percent
- distinct_count, distinct_percent
- min, max, mean, median, stddev
- top_values (most frequent)
- data_type distribution
"""

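# Typical invocations (illustrative sketch; the selector grammar matches
# _parse_selector below, but the exact flag spellings are assumed from the
# SELECT/EXCLUDE args usage and the task's own hint text, and
# "postgres.audit_log" is a hypothetical table name):
#
#     dvt profile run                                # default: profile all sources
#     dvt profile run --select 'source:postgres.*'   # one source's tables
#     dvt profile run --select 'model:staging.*' --exclude 'source:postgres.audit_log'
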
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import click

# Try to import Rich for beautiful CLI output
try:
    from rich.console import Console
    from rich.progress import (
        Progress,
        TextColumn,
        BarColumn,
        MofNCompleteColumn,
        TimeElapsedColumn,
        SpinnerColumn,
        TaskProgressColumn,
    )
    from rich.table import Table
    from rich import box
    from rich.panel import Panel
    from rich.style import Style
    from rich.live import Live
    HAS_RICH = True
except ImportError:
    HAS_RICH = False

from dbt.artifacts.schemas.run import RunStatus
from dbt.config.runtime import RuntimeConfig
from dbt.contracts.graph.manifest import Manifest
from dbt.contracts.graph.nodes import SourceDefinition, ModelNode
from dbt.task.base import BaseTask

# Initialize Rich console
console = Console() if HAS_RICH else None


@dataclass
class ColumnProfile:
    """
    Profile result for a single column (PipeRider-style metrics).

    PipeRider Metric Names (exact copy from piperider_cli/profiler/profiler.py):
    - total: Total row count in table
    - samples: Number of sampled rows (same as total if no sampling)
    - samples_p: Sampling percentage (1.0 = 100%)
    - non_nulls: Count of non-null values
    - non_nulls_p: Percentage of non-null values
    - nulls: Count of null values
    - nulls_p: Percentage of null values
    - valids: Count of valid values (non-null, parseable)
    - valids_p: Percentage of valid values
    - invalids: Count of invalid values
    - invalids_p: Percentage of invalid values
    - distinct: Count of distinct values
    - distinct_p: Percentage of distinct values
    - duplicates: Count of duplicate values
    - duplicates_p: Percentage of duplicate values
    - non_duplicates: Count of non-duplicate (unique) values
    - non_duplicates_p: Percentage of non-duplicate values
    - min: Minimum value
    - max: Maximum value
    - sum: Sum (numeric only)
    - avg: Average/mean (numeric only)
    - stddev: Standard deviation (numeric only)
    - p5, p25, p50, p75, p95: Percentiles (numeric only)
    - zeros, zeros_p: Zero values (numeric only)
    - negatives, negatives_p: Negative values (numeric only)
    - positives, positives_p: Positive values (numeric only)
    - min_length, max_length, avg_length: String length stats
    - zero_length, zero_length_p: Empty strings
    - topk: Top K values with counts
    - histogram: Distribution histogram
    """

    # Column identity
    name: str  # PipeRider uses 'name', not 'column_name'
    type: str  # PipeRider generic type: string, integer, numeric, datetime, boolean, other
    schema_type: str = ""  # Original database type (VARCHAR, INTEGER, etc.)

    # Core metrics (PipeRider exact names)
    total: Optional[int] = None  # Set from table row_count
    samples: int = 0  # Number of sampled rows
    samples_p: Optional[float] = None  # Sampling percentage

    # Null metrics
    non_nulls: int = 0
    non_nulls_p: Optional[float] = None
    nulls: int = 0
    nulls_p: Optional[float] = None

    # Validity metrics
    valids: int = 0
    valids_p: Optional[float] = None
    invalids: int = 0
    invalids_p: Optional[float] = None

    # Distinct/uniqueness metrics
    distinct: int = 0
    distinct_p: Optional[float] = None
    duplicates: int = 0
    duplicates_p: Optional[float] = None
    non_duplicates: int = 0
    non_duplicates_p: Optional[float] = None

    # Numeric statistics
    min: Optional[float] = None
    max: Optional[float] = None
    sum: Optional[float] = None
    avg: Optional[float] = None
    stddev: Optional[float] = None

    # Percentiles (numeric)
    p5: Optional[float] = None
    p25: Optional[float] = None
    p50: Optional[float] = None
    p75: Optional[float] = None
    p95: Optional[float] = None

    # Numeric sign distribution
    zeros: int = 0
    zeros_p: Optional[float] = None
    negatives: int = 0
    negatives_p: Optional[float] = None
    positives: int = 0
    positives_p: Optional[float] = None

    # String length metrics
    min_length: Optional[int] = None
    max_length: Optional[int] = None
    avg_length: Optional[float] = None
    stddev_length: Optional[float] = None
    zero_length: int = 0
    zero_length_p: Optional[float] = None
    non_zero_length: int = 0
    non_zero_length_p: Optional[float] = None

    # Boolean metrics
    trues: int = 0
    trues_p: Optional[float] = None
    falses: int = 0
    falses_p: Optional[float] = None

    # Distribution data (PipeRider format)
    topk: Optional[Dict] = None  # {"values": [...], "counts": [...]}
    histogram: Optional[Dict] = None  # {"labels": [...], "counts": [...], "bin_edges": [...]}
    histogram_length: Optional[Dict] = None  # For string length distribution

    # Quality alerts (PipeRider format)
    alerts: List[Dict] = field(default_factory=list)

    # Profiling metadata
    profile_duration: Optional[str] = None  # "1.23" seconds
    elapsed_milli: int = 0  # Duration in milliseconds

    # Legacy aliases for backward compatibility
    @property
    def column_name(self) -> str:
        return self.name

    @property
    def data_type(self) -> str:
        return self.type

    @property
    def row_count(self) -> int:
        return self.samples

    @property
    def null_count(self) -> int:
        return self.nulls

    @property
    def null_percent(self) -> float:
        return (self.nulls_p or 0.0) * 100

    @property
    def distinct_count(self) -> int:
        return self.distinct

    @property
    def distinct_percent(self) -> float:
        return (self.distinct_p or 0.0) * 100

    @property
    def duration_ms(self) -> int:
        return self.elapsed_milli


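# Fraction-vs-percent convention (illustrative example, not part of the packaged
# module): the PipeRider-named fields store fractions in [0, 1]; the legacy
# alias properties convert to percentages.
#
#     >>> p = ColumnProfile(name="email", type="string", nulls=25, nulls_p=0.25)
#     >>> p.null_percent
#     25.0
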
@dataclass
class TableProfile:
    """Profile result for a table."""
    source_name: str
    table_name: str
    connection_name: str
    row_count: int
    column_count: int
    columns: List[ColumnProfile]
    profile_mode: str
    profiled_at: datetime
    duration_ms: int
    alerts: List[Dict] = field(default_factory=list)
    status: str = "success"
    error: Optional[str] = None


@dataclass
class ProfileExecutionResult:
    """Result of profile execution."""
    tables_profiled: int = 0
    total_rows: int = 0
    total_columns: int = 0
    total_alerts: int = 0
    duration_ms: int = 0
    profiles: List[TableProfile] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)


class ProfileTask(BaseTask):
    """
    DAG-based profiling task for DVT (PipeRider-style).

    v0.58.1: Uses fast SQL-based profiling queries instead of slow ydata-profiling.

    Execution flow:
    1. Parse selectors (--select, --exclude)
    2. Build execution list (sources + models)
    3. For each node:
       a. Execute efficient SQL profiling queries
       b. Collect PipeRider-style metrics
       c. Store results in metadata_store.duckdb
    4. Display summary (PipeRider-style)
    """

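    # Construction sketch (illustrative; flags, runtime_config, and manifest are
    # normally supplied by the CLI layer -- how they are obtained is assumed):
    #
    #     task = ProfileTask(flags, runtime_config, manifest)
    #     result = task.run()                  # ProfileExecutionResult
    #     ok = task.interpret_results(result)  # True if any table profiled OK
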
    def __init__(
        self,
        flags: Any,
        runtime_config: RuntimeConfig,
        manifest: Manifest,
    ):
        super().__init__(flags)  # BaseTask only takes flags, sets self.args
        self.runtime_config = runtime_config
        self.manifest = manifest
        self.profile_mode = getattr(self.args, "MODE", "explorative") or "explorative"
        self._threads = getattr(self.args, "THREADS", 4) or 4

    def run(self) -> ProfileExecutionResult:
        """Execute profiling on selected sources and models."""
        start_time = time.time()
        result = ProfileExecutionResult()

        # Print header with Rich Panel
        if HAS_RICH:
            console.print()
            header_panel = Panel(
                f"[bold cyan]Mode:[/bold cyan] [yellow]{self.profile_mode}[/yellow] | "
                f"[bold cyan]Threads:[/bold cyan] [yellow]{self._threads}[/yellow]",
                title="[bold magenta]DVT Profile - Data Profiling[/bold magenta]",
                subtitle="[dim]PipeRider-style fast SQL profiling[/dim]",
                border_style="magenta",
                box=box.DOUBLE,
            )
            console.print(header_panel)
            console.print()
        else:
            print("\n" + "=" * 60)
            print(" DVT Profile - Data Profiling")
            print(f" Mode: {self.profile_mode} | Threads: {self._threads}")
            print("=" * 60 + "\n")

        # Get selected nodes
        nodes = self._get_selected_nodes()

        if not nodes:
            if HAS_RICH:
                console.print("[yellow]No sources or models selected for profiling.[/yellow]")
                console.print("[dim]Use --select to specify targets, e.g.: dvt profile run --select 'source:*'[/dim]")
            else:
                print("No sources or models selected for profiling.")
            return result

        # Profile with progress display
        if HAS_RICH:
            result = self._profile_with_progress(nodes, result)
        else:
            result = self._profile_without_progress(nodes, result)

        # Calculate duration
        result.duration_ms = int((time.time() - start_time) * 1000)

        # Print summary
        self._print_summary(result)

        return result

    def _profile_with_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
        """Profile nodes with Rich progress display."""
        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            main_task = progress.add_task("[cyan]Profiling...", total=len(nodes))

            # Profile each node
            for i, node in enumerate(nodes, 1):
                node_name = self._get_node_display_name(node)
                progress.update(main_task, description=f"[cyan]Profiling[/cyan] [bold]{node_name}[/bold]")

                profile = self._profile_node(node, i, len(nodes))
                if profile:
                    result.profiles.append(profile)
                    result.tables_profiled += 1
                    result.total_rows += profile.row_count
                    result.total_columns += profile.column_count
                    # profile.alerts already aggregates the column-level alerts
                    # (see _profile_node), so count it only once here.
                    result.total_alerts += len(profile.alerts)

                    # Store in metadata_store.duckdb
                    self._store_profile(profile)

                    # Show result line
                    status_icon = "[green]OK[/green]" if profile.status == "success" else "[red]FAIL[/red]"
                    console.print(
                        f"  {status_icon} {node_name} "
                        f"[dim]({profile.row_count:,} rows, {profile.column_count} cols, {profile.duration_ms}ms)[/dim]"
                    )

                progress.advance(main_task)

        return result

    def _profile_without_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
        """Profile nodes without Rich (fallback)."""
        for i, node in enumerate(nodes, 1):
            node_name = self._get_node_display_name(node)
            print(f"  [{i}/{len(nodes)}] Profiling {node_name}...")

            profile = self._profile_node(node, i, len(nodes))
            if profile:
                result.profiles.append(profile)
                result.tables_profiled += 1
                result.total_rows += profile.row_count
                result.total_columns += profile.column_count
                # Keep alert totals consistent with the Rich path.
                result.total_alerts += len(profile.alerts)

                self._store_profile(profile)

                status = "OK" if profile.status == "success" else "FAIL"
                print(f"    {status} ({profile.row_count:,} rows, {profile.column_count} cols)")

        return result

    def _get_selected_nodes(self) -> List[Any]:
        """Get list of nodes to profile based on selectors."""
        nodes = []

        # If no selection, default to all sources
        selector = getattr(self.args, "SELECT", None)
        exclude = getattr(self.args, "EXCLUDE", None)

        if not selector:
            # Default: profile all sources
            for source_id, source in self.manifest.sources.items():
                nodes.append(source)
        else:
            # Parse selection
            for sel in selector:
                if isinstance(sel, tuple):
                    for s in sel:
                        nodes.extend(self._parse_selector(s))
                else:
                    nodes.extend(self._parse_selector(sel))

        # Apply exclusions
        if exclude:
            excluded = set()
            for exc in exclude:
                if isinstance(exc, tuple):
                    for e in exc:
                        excluded.update(self._get_excluded_ids(e))
                else:
                    excluded.update(self._get_excluded_ids(exc))
            nodes = [n for n in nodes if self._get_node_id(n) not in excluded]

        return nodes

    def _parse_selector(self, selector: str) -> List[Any]:
        """Parse a selector string into nodes."""
        nodes = []

        if selector.startswith("source:"):
            # Source selector: source:* or source:postgres.*
            pattern = selector[7:]  # Remove "source:" prefix
            for source_id, source in self.manifest.sources.items():
                if self._matches_pattern(source, pattern):
                    nodes.append(source)

        elif selector.startswith("model:"):
            # Model selector: model:* or model:staging.*
            pattern = selector[6:]  # Remove "model:" prefix
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "resource_type") and node.resource_type.value == "model":
                    if self._matches_pattern(node, pattern):
                        nodes.append(node)

        elif "*" in selector:
            # Wildcard - match both sources and models
            pattern = selector
            for source_id, source in self.manifest.sources.items():
                if self._matches_pattern(source, pattern):
                    nodes.append(source)
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "resource_type") and node.resource_type.value == "model":
                    if self._matches_pattern(node, pattern):
                        nodes.append(node)

        else:
            # Exact match by name
            for source_id, source in self.manifest.sources.items():
                if source.name == selector or source.identifier == selector:
                    nodes.append(source)
            for node_id, node in self.manifest.nodes.items():
                if hasattr(node, "name") and node.name == selector:
                    nodes.append(node)

        return nodes

    def _matches_pattern(self, node: Any, pattern: str) -> bool:
        """Check if a node matches a glob pattern."""
        import fnmatch

        if pattern == "*":
            return True

        name = getattr(node, "name", "")
        identifier = getattr(node, "identifier", name)
        source_name = getattr(node, "source_name", "")
        unique_id = getattr(node, "unique_id", "")

        # Try matching against different attributes
        full_name = f"{source_name}.{identifier}" if source_name else identifier

        # Extract just the source_name.table portion from unique_id.
        # unique_id format: source.project_name.source_name.table_name
        # We want to match against: project_name.source_name.table_name
        parts = unique_id.split(".")
        if len(parts) >= 4 and parts[0] == "source":
            # project_name.source_name.table_name
            project_source_table = ".".join(parts[1:])
            source_table = ".".join(parts[2:])  # source_name.table_name
        else:
            project_source_table = unique_id
            source_table = full_name

        return (
            fnmatch.fnmatch(name, pattern)
            or fnmatch.fnmatch(identifier, pattern)
            or fnmatch.fnmatch(full_name, pattern)
            or fnmatch.fnmatch(project_source_table, pattern)
            or fnmatch.fnmatch(source_table, pattern)
            or fnmatch.fnmatch(unique_id, pattern)
        )

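    # Matching example (illustrative): a source with
    #     unique_id   = "source.my_project.postgres.users"
    #     source_name = "postgres", identifier = name = "users"
    # is matched by the CLI selectors 'source:users', 'source:postgres.*',
    # 'source:my_project.postgres.users', and 'source:source.my_project.postgres.*'
    # (the "source:" prefix is stripped by _parse_selector before matching).
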
    def _get_excluded_ids(self, exclude_str: str) -> Set[str]:
        """Get IDs of nodes matching exclusion pattern."""
        ids = set()
        nodes = self._parse_selector(exclude_str)
        for node in nodes:
            ids.add(self._get_node_id(node))
        return ids

    def _get_node_id(self, node: Any) -> str:
        """Get unique ID for a node."""
        if hasattr(node, "unique_id"):
            return node.unique_id
        return getattr(node, "name", str(node))

    def _get_node_display_name(self, node: Any) -> str:
        """Get display name for a node."""
        if isinstance(node, SourceDefinition):
            return f"{node.source_name}.{node.identifier}"
        else:
            return getattr(node, "name", str(node))

    def _profile_node(self, node: Any, index: int, total: int) -> Optional[TableProfile]:
        """Profile a single node (source or model)."""
        start_time = time.time()

        # Get node info
        if isinstance(node, SourceDefinition):
            source_name = node.source_name
            table_name = node.identifier
            connection_name = getattr(node, "config", {}).get("target", "default")
            node_type = "source"
        else:
            source_name = "models"
            table_name = node.name
            connection_name = getattr(node.config, "target", "default") if hasattr(node, "config") else "default"
            node_type = "model"

        try:
            # Execute profiling
            columns = self._execute_profile(node)

            duration_ms = int((time.time() - start_time) * 1000)

            # Calculate totals
            row_count = columns[0].row_count if columns else 0

            # Collect alerts
            alerts = []
            for col in columns:
                alerts.extend(col.alerts)

            profile = TableProfile(
                source_name=source_name,
                table_name=table_name,
                connection_name=connection_name,
                row_count=row_count,
                column_count=len(columns),
                columns=columns,
                profile_mode=self.profile_mode,
                profiled_at=datetime.now(),
                duration_ms=duration_ms,
                alerts=alerts,
                status="success",
            )

            return profile

        except Exception as e:
            duration_ms = int((time.time() - start_time) * 1000)
            return TableProfile(
                source_name=source_name,
                table_name=table_name,
                connection_name=connection_name,
                row_count=0,
                column_count=0,
                columns=[],
                profile_mode=self.profile_mode,
                profiled_at=datetime.now(),
                duration_ms=duration_ms,
                status="error",
                error=str(e),
            )

    def _execute_profile(self, node: Any) -> List[ColumnProfile]:
        """
        Execute PipeRider-style profiling queries on a node.

        Uses efficient SQL queries to compute:
        - row_count, null_count, distinct_count
        - min, max, mean, stddev (numeric)
        - min_length, max_length, avg_length (string)
        - top_values (categorical)
        """
        columns = []

        # Get table info
        if isinstance(node, SourceDefinition):
            schema = node.schema
            table = node.identifier
            database = getattr(node, "database", None)
            target_name = node.config.get("target") if hasattr(node, "config") else None
        else:
            schema = node.schema
            table = node.alias or node.name
            database = getattr(node, "database", None)
            target_name = getattr(node.config, "target", None) if hasattr(node, "config") else None

        # Get adapter for connection
        adapter = self._get_adapter(target_name)

        # Get column info - either from node definition or by querying database
        node_columns = getattr(node, "columns", {})

        if not node_columns:
            # Query database for column info
            column_info = self._get_columns_from_db(adapter, database, schema, table)
        else:
            column_info = [
                (col_name, getattr(col_info, "data_type", "VARCHAR") or "VARCHAR")
                for col_name, col_info in node_columns.items()
            ]

        if not column_info:
            # Fallback: profile as single row count only
            row_count = self._get_row_count(adapter, database, schema, table)
            return [ColumnProfile(
                name="_table_",
                type="TABLE",
                schema_type="TABLE",
                total=row_count,
                samples=row_count,
            )]

        # Get row count once for all columns
        row_count = self._get_row_count(adapter, database, schema, table)

        # Profile columns in parallel using threads
        if self._threads > 1 and len(column_info) > 1:
            with ThreadPoolExecutor(max_workers=min(self._threads, len(column_info))) as executor:
                futures = {
                    executor.submit(
                        self._profile_column_sql,
                        adapter, database, schema, table,
                        col_name, col_type, row_count,
                    ): (col_name, col_type)
                    for col_name, col_type in column_info
                }
                for future in as_completed(futures):
                    try:
                        profile = future.result()
                        columns.append(profile)
                    except Exception as e:
                        col_name, col_type = futures[future]
                        columns.append(ColumnProfile(
                            name=col_name,
                            type=self._classify_type(col_type),
                            schema_type=col_type,
                            total=row_count,
                            samples=row_count,
                            alerts=[{"type": "PROFILE_ERROR", "severity": "warning", "message": str(e)[:100]}],
                        ))
        else:
            # Sequential profiling
            for col_name, col_type in column_info:
                profile = self._profile_column_sql(
                    adapter, database, schema, table,
                    col_name, col_type, row_count,
                )
                columns.append(profile)

        return columns

    def _get_adapter(self, target_name: Optional[str] = None):
        """Get adapter for the specified target or default."""
        from dbt.adapters.factory import get_adapter

        # Get adapter from runtime config
        adapter = get_adapter(self.runtime_config)
        return adapter

    def _get_columns_from_db(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> List[tuple]:
        """Query database to get column names and types."""
        try:
            # Use adapter's get_columns_in_relation
            from dbt.adapters.base import BaseRelation

            relation = adapter.Relation.create(
                database=database,
                schema=schema,
                identifier=table,
            )

            with adapter.connection_named("profile"):
                columns = adapter.get_columns_in_relation(relation)
                return [(col.name, col.dtype) for col in columns]
        except Exception:
            return []

    def _get_row_count(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> int:
        """Get row count from table."""
        try:
            fqn = self._build_fqn(adapter, database, schema, table)
            sql = f"SELECT COUNT(*) as cnt FROM {fqn}"

            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    return int(result[0][0])
        except Exception:
            pass
        return 0

    def _build_fqn(
        self, adapter, database: Optional[str], schema: str, table: str
    ) -> str:
        """Build fully qualified table name."""
        parts = []
        if database:
            parts.append(adapter.quote(database))
        if schema:
            parts.append(adapter.quote(schema))
        parts.append(adapter.quote(table))
        return ".".join(parts)

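    # Example (illustrative; assumes an adapter whose quote() wraps identifiers
    # in double quotes, as the Postgres-family adapters do):
    #
    #     _build_fqn(adapter, "analytics", "public", "users")
    #     -> '"analytics"."public"."users"'
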
    def _classify_type(self, col_type: str) -> str:
        """Classify database type into PipeRider generic type."""
        col_type_lower = col_type.lower()

        if any(t in col_type_lower for t in ["int", "bigint", "smallint", "tinyint", "serial"]):
            return "integer"
        elif any(t in col_type_lower for t in ["numeric", "decimal", "float", "double", "real", "number"]):
            return "numeric"
        elif any(t in col_type_lower for t in ["char", "varchar", "text", "string", "clob"]):
            return "string"
        elif any(t in col_type_lower for t in ["date", "time", "timestamp"]):
            return "datetime"
        elif any(t in col_type_lower for t in ["bool", "boolean"]):
            return "boolean"
        else:
            return "other"

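    # Classification examples (the checks above run top to bottom on substrings):
    #
    #     _classify_type("BIGINT")        -> "integer"
    #     _classify_type("NUMERIC(10,2)") -> "numeric"
    #     _classify_type("VARCHAR(255)")  -> "string"
    #     _classify_type("TIMESTAMPTZ")   -> "datetime"
    #     _classify_type("BYTEA")         -> "other"
    #
    # Caveat: substring matching has edge cases; e.g. Postgres "POINT" contains
    # "int" and would classify as "integer".
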
    def _profile_column_sql(
        self, adapter, database: Optional[str], schema: str, table: str,
        col_name: str, col_type: str, row_count: int,
    ) -> ColumnProfile:
        """
        Profile a single column using efficient SQL queries.

        PipeRider-style: single-pass or minimal queries for all metrics.
        Uses PipeRider metric names: nulls, non_nulls, distinct, valids, etc.
        """
        start_time = time.time()

        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        fqn = self._build_fqn(adapter, database, schema, table)
        quoted_col = adapter.quote(col_name)

        try:
            # Determine column type category
            col_type_lower = col_type.lower()
            is_numeric = any(t in col_type_lower for t in [
                "int", "numeric", "decimal", "float", "double", "real", "number", "bigint", "smallint",
            ])
            is_string = any(t in col_type_lower for t in [
                "char", "varchar", "text", "string", "clob",
            ])

            # Build comprehensive profiling query based on column type
            if is_numeric:
                profile = self._profile_numeric_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )
            elif is_string:
                profile = self._profile_string_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )
            else:
                # Default: basic metrics only
                profile = self._profile_basic_column(
                    adapter, fqn, quoted_col, col_name, col_type, row_count
                )

            # Get top values for categorical columns (not in sensitive mode)
            if self.profile_mode != "sensitive":
                if profile.distinct and profile.distinct <= 100:
                    self._add_top_values(adapter, fqn, quoted_col, profile)

        except Exception as e:
            # If SQL fails, return what we have
            profile.alerts.append({
                "type": "PROFILE_ERROR",
                "severity": "warning",
                "message": f"Could not profile column: {str(e)[:100]}",
            })

        # Generate quality alerts
        profile.alerts.extend(self._generate_alerts(profile))

        profile.elapsed_milli = int((time.time() - start_time) * 1000)
        profile.profile_duration = f"{(time.time() - start_time):.2f}"
        return profile

    def _profile_numeric_column(
        self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
    ) -> ColumnProfile:
        """Profile a numeric column with all stats in one query (PipeRider-style)."""
        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        # Single comprehensive query for numeric columns (PipeRider-style)
        sql = f"""
            SELECT
                COUNT(*) - COUNT({quoted_col}) as nulls,
                COUNT({quoted_col}) as non_nulls,
                COUNT(DISTINCT {quoted_col}) as distinct_val,
                MIN({quoted_col}) as min_val,
                MAX({quoted_col}) as max_val,
                SUM(CAST({quoted_col} AS DOUBLE PRECISION)) as sum_val,
                AVG(CAST({quoted_col} AS DOUBLE PRECISION)) as avg_val,
                STDDEV(CAST({quoted_col} AS DOUBLE PRECISION)) as stddev_val,
                SUM(CASE WHEN {quoted_col} = 0 THEN 1 ELSE 0 END) as zeros,
                SUM(CASE WHEN {quoted_col} < 0 THEN 1 ELSE 0 END) as negatives,
                SUM(CASE WHEN {quoted_col} > 0 THEN 1 ELSE 0 END) as positives
            FROM {fqn}
        """

        try:
            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    row = result[0]
                    # PipeRider-style metric names
                    profile.nulls = int(row[0] or 0)
                    profile.non_nulls = int(row[1] or 0)
                    profile.distinct = int(row[2] or 0)
                    profile.min = float(row[3]) if row[3] is not None else None
                    profile.max = float(row[4]) if row[4] is not None else None
                    profile.sum = float(row[5]) if row[5] is not None else None
                    profile.avg = float(row[6]) if row[6] is not None else None
                    profile.stddev = float(row[7]) if row[7] is not None else None
                    profile.zeros = int(row[8] or 0)
                    profile.negatives = int(row[9] or 0)
                    profile.positives = int(row[10] or 0)

                    # Calculate percentages (PipeRider-style with decimal 0-1)
                    if row_count > 0:
                        profile.nulls_p = profile.nulls / row_count
                        profile.non_nulls_p = profile.non_nulls / row_count
                        profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None
                        profile.zeros_p = profile.zeros / row_count
                        profile.negatives_p = profile.negatives / row_count
                        profile.positives_p = profile.positives / row_count

                    # Validity metrics (for numeric, valid = non-null)
                    profile.valids = profile.non_nulls
                    profile.valids_p = profile.non_nulls_p
                    profile.invalids = profile.nulls
                    profile.invalids_p = profile.nulls_p

                    # Duplicate metrics
                    if profile.non_nulls > 0 and profile.distinct > 0:
                        profile.non_duplicates = profile.distinct
                        profile.duplicates = profile.non_nulls - profile.distinct
                        profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
                        profile.duplicates_p = profile.duplicates / profile.non_nulls

            # Try to get percentiles for explorative mode
            if self.profile_mode in ["explorative", "time-series"]:
                self._add_percentiles(adapter, fqn, quoted_col, profile)

        except Exception:
            # Fall back to basic profile
            profile = self._profile_basic_column(
                adapter, fqn, quoted_col, col_name, col_type, row_count
            )

        return profile

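    # Worked example (illustrative): for a column holding (0, -3, 7, NULL) with
    # row_count = 4, the query yields nulls=1, non_nulls=3, distinct=3, min=-3.0,
    # max=7.0, sum=4.0, avg=1.33..., zeros=1, negatives=1, positives=1; the
    # derived fractions are nulls_p=0.25, non_nulls_p=0.75, distinct_p=0.75
    # (note distinct_p divides by row_count, not by non_nulls), duplicates=0.
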
    def _profile_string_column(
        self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
    ) -> ColumnProfile:
        """Profile a string column with all stats in one query (PipeRider-style)."""
        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        # Single comprehensive query for string columns (PipeRider-style)
        sql = f"""
            SELECT
                COUNT(*) - COUNT({quoted_col}) as nulls,
                COUNT({quoted_col}) as non_nulls,
                COUNT(DISTINCT {quoted_col}) as distinct_val,
                MIN(LENGTH({quoted_col})) as min_len,
                MAX(LENGTH({quoted_col})) as max_len,
                AVG(LENGTH({quoted_col})) as avg_len,
                SUM(CASE WHEN LENGTH({quoted_col}) = 0 THEN 1 ELSE 0 END) as zero_length_count
            FROM {fqn}
        """

        try:
            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    row = result[0]
                    # PipeRider-style metric names
                    profile.nulls = int(row[0] or 0)
                    profile.non_nulls = int(row[1] or 0)
                    profile.distinct = int(row[2] or 0)
                    profile.min_length = int(row[3]) if row[3] is not None else None
                    profile.max_length = int(row[4]) if row[4] is not None else None
                    profile.avg_length = float(row[5]) if row[5] is not None else None
                    profile.zero_length = int(row[6] or 0)

                    # Calculate percentages (PipeRider-style with decimal 0-1)
                    if row_count > 0:
                        profile.nulls_p = profile.nulls / row_count
                        profile.non_nulls_p = profile.non_nulls / row_count
                        profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None
                        profile.zero_length_p = profile.zero_length / row_count

                    # Validity metrics (for string, valid = non-null non-empty)
                    profile.valids = profile.non_nulls - profile.zero_length
                    profile.invalids = profile.nulls + profile.zero_length
                    if row_count > 0:
                        profile.valids_p = profile.valids / row_count
                        profile.invalids_p = profile.invalids / row_count

                    # Non-zero length
                    profile.non_zero_length = profile.non_nulls - profile.zero_length
                    if profile.non_nulls > 0:
                        profile.non_zero_length_p = profile.non_zero_length / profile.non_nulls

                    # Duplicate metrics
                    if profile.non_nulls > 0 and profile.distinct > 0:
                        profile.non_duplicates = profile.distinct
                        profile.duplicates = profile.non_nulls - profile.distinct
                        profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
                        profile.duplicates_p = profile.duplicates / profile.non_nulls

        except Exception:
            # Fall back to basic profile
            profile = self._profile_basic_column(
                adapter, fqn, quoted_col, col_name, col_type, row_count
            )

        return profile

    def _profile_basic_column(
        self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str, row_count: int
    ) -> ColumnProfile:
        """Profile any column with basic metrics only (PipeRider-style)."""
        generic_type = self._classify_type(col_type)
        profile = ColumnProfile(
            name=col_name,
            type=generic_type,
            schema_type=col_type,
            total=row_count,
            samples=row_count,
            samples_p=1.0,
        )

        sql = f"""
            SELECT
                COUNT(*) - COUNT({quoted_col}) as nulls,
                COUNT({quoted_col}) as non_nulls,
                COUNT(DISTINCT {quoted_col}) as distinct_val
            FROM {fqn}
        """

        try:
            with adapter.connection_named("profile"):
                _, result = adapter.execute(sql, fetch=True)
                if result and len(result) > 0:
                    profile.nulls = int(result[0][0] or 0)
                    profile.non_nulls = int(result[0][1] or 0)
                    profile.distinct = int(result[0][2] or 0)

                    # Calculate percentages (PipeRider-style with decimal 0-1)
                    if row_count > 0:
                        profile.nulls_p = profile.nulls / row_count
                        profile.non_nulls_p = profile.non_nulls / row_count
                        profile.distinct_p = profile.distinct / row_count if profile.non_nulls > 0 else None

                    # Validity metrics
                    profile.valids = profile.non_nulls
                    profile.valids_p = profile.non_nulls_p
                    profile.invalids = profile.nulls
                    profile.invalids_p = profile.nulls_p

                    # Duplicate metrics
                    if profile.non_nulls > 0 and profile.distinct > 0:
                        profile.non_duplicates = profile.distinct
                        profile.duplicates = profile.non_nulls - profile.distinct
                        profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
                        profile.duplicates_p = profile.duplicates / profile.non_nulls

        except Exception:
            pass

        return profile

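    # Duplicate-count example (illustrative): non_nulls=100 and distinct=60 give
    # duplicates = 100 - 60 = 40 (duplicates_p = 0.4) and non_duplicates = 60.
    # Note this counts extra occurrences beyond the first, not every row that
    # belongs to a duplicated group.
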
    def _add_percentiles(self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile) -> None:
        """Try to add percentiles to a numeric profile."""
        try:
            # Try PostgreSQL/Redshift style
            percentile_sql = f"""
                SELECT
                    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {quoted_col}) as p25,
                    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY {quoted_col}) as p50,
                    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {quoted_col}) as p75
                FROM {fqn}
                WHERE {quoted_col} IS NOT NULL
            """
            with adapter.connection_named("profile"):
                _, result = adapter.execute(percentile_sql, fetch=True)
                if result and len(result) > 0:
                    row = result[0]
                    profile.p25 = float(row[0]) if row[0] is not None else None
                    profile.p50 = float(row[1]) if row[1] is not None else None
                    profile.p75 = float(row[2]) if row[2] is not None else None
                    # Legacy alias, set as a dynamic attribute (ColumnProfile
                    # declares no median_value field).
                    profile.median_value = profile.p50
        except Exception:
            # Percentiles not supported on this database
            pass

    def _add_top_values(self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile) -> None:
        """Add top values to profile (PipeRider topk format)."""
        try:
            top_sql = f"""
                SELECT {quoted_col} as val, COUNT(*) as cnt
                FROM {fqn}
                WHERE {quoted_col} IS NOT NULL
                GROUP BY {quoted_col}
                ORDER BY cnt DESC
                LIMIT 10
            """
            with adapter.connection_named("profile"):
                _, result = adapter.execute(top_sql, fetch=True)
                if result:
                    # PipeRider topk format: {"values": [...], "counts": [...]}
                    values = [str(row[0]) for row in result]
                    counts = [int(row[1]) for row in result]
                    profile.topk = {
                        "values": values,
                        "counts": counts,
                    }
        except Exception:
            pass

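    # topk shape example (illustrative): a column holding "a" ten times and "b"
    # three times produces profile.topk == {"values": ["a", "b"], "counts": [10, 3]}.
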
    def _generate_alerts(self, profile: ColumnProfile) -> List[Dict]:
        """
        Generate quality alerts for a column profile (PipeRider-style).

        PipeRider alert types (from piperider_cli/profiler/event.py):
        - missing_value: High percentage of null/missing values
        - high_distinct: Very high cardinality (possible PK)
        - low_distinct: Very low cardinality (possible boolean/flag)
        - all_null: 100% null values
        - constant: All values are the same
        - negative_value: Has negative values in numeric column
        - zero_length_string: Has empty strings
        """
        alerts = []

        # Get null percentage (as 0-100 for comparison)
        nulls_pct = (profile.nulls_p or 0) * 100 if profile.nulls_p is not None else 0
        distinct_pct = (profile.distinct_p or 0) * 100 if profile.distinct_p is not None else 0

        # High null rate alert (PipeRider: missing_value)
        if nulls_pct > 50:
            alerts.append({
                "type": "missing_value",
                "severity": "error",
                "column": profile.name,
                "message": f"Column has {nulls_pct:.1f}% null values (>50%)",
            })
        elif nulls_pct > 20:
            alerts.append({
                "type": "missing_value",
                "severity": "warning",
                "column": profile.name,
                "message": f"Column has {nulls_pct:.1f}% null values",
            })

        # High cardinality alert (PipeRider: high_distinct)
        if distinct_pct > 99 and profile.samples > 100:
            alerts.append({
                "type": "high_distinct",
                "severity": "info",
                "column": profile.name,
                "message": f"Column is {distinct_pct:.1f}% unique (possible primary key)",
            })

        # Low cardinality (PipeRider: low_distinct)
        if profile.distinct and profile.distinct < 10 and profile.samples > 1000:
            alerts.append({
                "type": "low_distinct",
                "severity": "info",
                "column": profile.name,
                "message": f"Column has only {profile.distinct} distinct values (possible category)",
            })

        # All nulls alert (PipeRider: all_null)
        if nulls_pct >= 100 or (profile.non_nulls == 0 and profile.nulls > 0):
            alerts.append({
                "type": "all_null",
                "severity": "error",
                "column": profile.name,
                "message": "Column is 100% null - consider removing",
            })

        # Zero variance / constant alert (PipeRider: constant)
        if profile.min is not None and profile.max is not None:
            if profile.min == profile.max and profile.distinct == 1:
                alerts.append({
                    "type": "constant",
                    "severity": "warning",
                    "column": profile.name,
                    "message": f"Column has constant value: {profile.min}",
                })

        # Negative values (PipeRider: negative_value) - informational only
        if profile.negatives and profile.negatives > 0:
            negatives_pct = (profile.negatives_p or 0) * 100
            if negatives_pct > 50:
                alerts.append({
                    "type": "negative_value",
                    "severity": "info",
                    "column": profile.name,
                    "message": f"Column has {negatives_pct:.1f}% negative values",
                })

        # Zero-length strings (PipeRider: zero_length_string)
        if profile.zero_length and profile.zero_length > 0:
            zero_len_pct = (profile.zero_length_p or 0) * 100
            if zero_len_pct > 10:
                alerts.append({
                    "type": "zero_length_string",
                    "severity": "warning",
                    "column": profile.name,
                    "message": f"Column has {zero_len_pct:.1f}% empty strings",
                })

        return alerts

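    # Example alert payload (illustrative): a column "email" with nulls_p = 0.6
    # yields
    #
    #     {"type": "missing_value", "severity": "error", "column": "email",
    #      "message": "Column has 60.0% null values (>50%)"}
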
    def _store_profile(self, profile: TableProfile) -> None:
        """Store profile results in metadata_store.duckdb."""
        try:
            # Check if DuckDB is available
            try:
                import duckdb
            except ImportError:
                if HAS_RICH:
                    console.print("[yellow]Warning: DuckDB not available. Profile results will not be persisted.[/yellow]")
                return

            from dbt.compute.metadata import ProjectMetadataStore, ColumnProfileResult

            project_root = Path(self.runtime_config.project_root)
            store = ProjectMetadataStore(project_root)
            store.initialize()

            for col in profile.columns:
                # Map from the PipeRider-named ColumnProfile fields
                # (min/max/avg/stddev/p50/topk) to the store's legacy column
                # names; ColumnProfile has no min_value/mean_value/top_values
                # attributes.
                result = ColumnProfileResult(
                    source_name=profile.source_name,
                    table_name=profile.table_name,
                    column_name=col.column_name,
                    profile_mode=profile.profile_mode,
                    row_count=col.row_count,
                    null_count=col.null_count,
                    null_percent=col.null_percent,
                    distinct_count=col.distinct_count,
                    distinct_percent=col.distinct_percent,
                    min_value=col.min,
                    max_value=col.max,
                    mean_value=col.avg,
                    median_value=col.p50,
                    stddev_value=col.stddev,
                    p25=col.p25,
                    p50=col.p50,
                    p75=col.p75,
                    min_length=col.min_length,
                    max_length=col.max_length,
                    avg_length=col.avg_length,
                    histogram=json.dumps(col.histogram) if col.histogram else None,
                    top_values=json.dumps(col.topk) if col.topk else None,
                    alerts=json.dumps(col.alerts) if col.alerts else None,
                    profiled_at=profile.profiled_at,
                    duration_ms=col.duration_ms,
                )
                store.save_profile_result(result)

            store.close()

        except Exception as e:
            # Log but don't fail if storage fails
            if HAS_RICH:
                console.print(f"[yellow]Warning: Could not store profile results: {e}[/yellow]")

    def _print_summary(self, result: ProfileExecutionResult) -> None:
        """Print PipeRider-style summary with Rich formatting."""
        if not HAS_RICH:
            print("\n" + "=" * 60)
            print(" SUMMARY")
            print(f" Tables profiled: {result.tables_profiled}")
            print(f" Total rows: {result.total_rows:,}")
            print(f" Total columns: {result.total_columns}")
            print(f" Alerts: {result.total_alerts}")
            print(f" Duration: {result.duration_ms / 1000:.1f}s")
            print("=" * 60 + "\n")
            return

        console.print()

        # Summary panel
        summary_lines = [
            f"[bold]Tables profiled:[/bold] {result.tables_profiled}",
            f"[bold]Total rows:[/bold] {result.total_rows:,}",
            f"[bold]Total columns:[/bold] {result.total_columns}",
        ]

        if result.total_alerts > 0:
            summary_lines.append(f"[bold yellow]Alerts:[/bold yellow] {result.total_alerts}")
        else:
            summary_lines.append("[bold green]Alerts:[/bold green] 0")

        summary_lines.append(f"[dim]Duration:[/dim] {result.duration_ms / 1000:.1f}s")

        console.print(Panel(
            "\n".join(summary_lines),
            title="[bold cyan]Summary[/bold cyan]",
            border_style="cyan",
            box=box.ROUNDED,
        ))

        # List alerts if any
        if result.total_alerts > 0:
            console.print()
            console.print("[bold yellow]Alerts:[/bold yellow]")
            console.print()

            alerts_table = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
            alerts_table.add_column("Severity", style="bold", width=8)
            alerts_table.add_column("Type", style="cyan", width=15)
            alerts_table.add_column("Location", style="white", width=30)
            alerts_table.add_column("Message", style="dim")

            for profile in result.profiles:
                for col in profile.columns:
                    for alert in col.alerts:
                        if alert["severity"] == "error":
                            sev_display = "[red]ERROR[/red]"
                        elif alert["severity"] == "warning":
                            sev_display = "[yellow]WARN[/yellow]"
                        else:
                            sev_display = "[blue]INFO[/blue]"

                        location = f"{profile.table_name}.{col.column_name}"
                        alerts_table.add_row(
                            sev_display,
                            alert["type"],
                            location,
                            alert["message"],
                        )

            console.print(alerts_table)

        console.print()

        # Success footer
        if result.tables_profiled > 0:
            console.print("[bold green]Profiling complete![/bold green]")
            console.print()
            console.print("[cyan]Results saved to:[/cyan] [bold].dvt/metadata_store.duckdb[/bold]")
            console.print("[dim]View report: dvt profile serve[/dim]")
        else:
            console.print("[yellow]No tables were profiled.[/yellow]")

        console.print()

    def interpret_results(self, result: ProfileExecutionResult) -> bool:
        """Interpret results to determine success/failure."""
        if not result.profiles:
            return False
        # Success if at least one profile completed
        return any(p.status == "success" for p in result.profiles)
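
# interpret_results examples (illustrative): profile statuses
# ["success", "error"] -> True; ["error", "error"] -> False;
# no profiles at all -> False.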