dvt-core 0.59.0a51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/task/profile.py ADDED
@@ -0,0 +1,1489 @@
1
+ """
2
+ DVT Profile Task
3
+
4
+ Data profiling task with DAG-based execution for sources and models.
5
+ Works like 'dvt run' with full selector support and DVT compute rules.
6
+
7
+ v0.56.0: Initial implementation with 4 profiling modes.
8
+ v0.58.1: PipeRider-style profiling - fast SQL-based metrics instead of ydata-profiling.
9
+ v0.58.7: Simplified to single comprehensive mode, added --sample flag for row sampling.
10
+
11
+ Features:
12
+ - Profiles ALL columns with comprehensive metrics
13
+ - Supports sampling: --sample 10000 (row count) or --sample 10% (percentage)
14
+ - PipeRider-style CLI output with column details
15
+ - Stores results in .dvt/metadata_store.duckdb
16
+
17
+ PipeRider-Style Metrics:
18
+ - row_count, column_count
19
+ - null_count, null_percent
20
+ - distinct_count, distinct_percent
21
+ - min, max, mean, median, stddev
22
+ - top_values (most frequent)
23
+ - percentiles (p25, p50, p75)
24
+ """
25
+
26
+ import json
27
+ import time
28
+ from concurrent.futures import ThreadPoolExecutor, as_completed
29
+ from dataclasses import dataclass, field
30
+ from datetime import datetime
31
+ from pathlib import Path
32
+ from typing import Any, Dict, List, Optional, Set
33
+
34
+ import click
35
+
36
+ # Try to import Rich for beautiful CLI output
37
+ try:
38
+ from rich.console import Console
39
+ from rich.progress import (
40
+ Progress,
41
+ TextColumn,
42
+ BarColumn,
43
+ MofNCompleteColumn,
44
+ TimeElapsedColumn,
45
+ SpinnerColumn,
46
+ TaskProgressColumn,
47
+ )
48
+ from rich.table import Table
49
+ from rich import box
50
+ from rich.panel import Panel
51
+ from rich.style import Style
52
+ from rich.live import Live
53
+ HAS_RICH = True
54
+ except ImportError:
55
+ HAS_RICH = False
56
+
57
+ from dbt.artifacts.schemas.run import RunStatus
58
+ from dbt.config.runtime import RuntimeConfig
59
+ from dbt.contracts.graph.manifest import Manifest
60
+ from dbt.contracts.graph.nodes import SourceDefinition, ModelNode
61
+ from dbt.task.base import BaseTask
62
+
63
+ # Initialize Rich console
64
+ console = Console() if HAS_RICH else None
65
+
66
+
67
+ @dataclass
68
+ class ColumnProfile:
69
+ """
70
+ Profile result for a single column (PipeRider-style metrics).
71
+
72
+ PipeRider Metric Names (exact copy from piperider_cli/profiler/profiler.py):
73
+ - total: Total row count in table
74
+ - samples: Number of sampled rows (same as total if no sampling)
75
+ - samples_p: Sampling percentage (1.0 = 100%)
76
+ - non_nulls: Count of non-null values
77
+ - non_nulls_p: Percentage of non-null values
78
+ - nulls: Count of null values
79
+ - nulls_p: Percentage of null values
80
+ - valids: Count of valid values (non-null, parseable)
81
+ - valids_p: Percentage of valid values
82
+ - invalids: Count of invalid values
83
+ - invalids_p: Percentage of invalid values
84
+ - distinct: Count of distinct values
85
+ - distinct_p: Percentage of distinct values
86
+ - duplicates: Count of duplicate values
87
+ - duplicates_p: Percentage of duplicate values
88
+ - non_duplicates: Count of non-duplicate (unique) values
89
+ - non_duplicates_p: Percentage of non-duplicate values
90
+ - min: Minimum value
91
+ - max: Maximum value
92
+ - sum: Sum (numeric only)
93
+ - avg: Average/mean (numeric only)
94
+ - stddev: Standard deviation (numeric only)
95
+ - p5, p25, p50, p75, p95: Percentiles (numeric only)
96
+ - zeros, zeros_p: Zero values (numeric only)
97
+ - negatives, negatives_p: Negative values (numeric only)
98
+ - positives, positives_p: Positive values (numeric only)
99
+ - min_length, max_length, avg_length: String length stats
100
+ - zero_length, zero_length_p: Empty strings
101
+ - topk: Top K values with counts
102
+ - histogram: Distribution histogram
103
+ """
104
+ # Column identity
105
+ name: str # PipeRider uses 'name' not 'column_name'
106
+ type: str # PipeRider uses 'type' (generic: string, integer, numeric, datetime, boolean, other)
107
+ schema_type: str = "" # Original database type (VARCHAR, INTEGER, etc.)
108
+
109
+ # Core metrics (PipeRider exact names)
110
+ total: Optional[int] = None # Set from table row_count
111
+ samples: int = 0 # Number of sampled rows
112
+ samples_p: Optional[float] = None # Sampling percentage
113
+
114
+ # Null metrics
115
+ non_nulls: int = 0
116
+ non_nulls_p: Optional[float] = None
117
+ nulls: int = 0
118
+ nulls_p: Optional[float] = None
119
+
120
+ # Validity metrics
121
+ valids: int = 0
122
+ valids_p: Optional[float] = None
123
+ invalids: int = 0
124
+ invalids_p: Optional[float] = None
125
+
126
+ # Distinct/uniqueness metrics
127
+ distinct: int = 0
128
+ distinct_p: Optional[float] = None
129
+ duplicates: int = 0
130
+ duplicates_p: Optional[float] = None
131
+ non_duplicates: int = 0
132
+ non_duplicates_p: Optional[float] = None
133
+
134
+ # Numeric statistics
135
+ min: Optional[float] = None
136
+ max: Optional[float] = None
137
+ sum: Optional[float] = None
138
+ avg: Optional[float] = None
139
+ stddev: Optional[float] = None
140
+
141
+ # Percentiles (numeric)
142
+ p5: Optional[float] = None
143
+ p25: Optional[float] = None
144
+ p50: Optional[float] = None
145
+ p75: Optional[float] = None
146
+ p95: Optional[float] = None
147
+
148
+ # Numeric sign distribution
149
+ zeros: int = 0
150
+ zeros_p: Optional[float] = None
151
+ negatives: int = 0
152
+ negatives_p: Optional[float] = None
153
+ positives: int = 0
154
+ positives_p: Optional[float] = None
155
+
156
+ # String length metrics
157
+ min_length: Optional[int] = None
158
+ max_length: Optional[int] = None
159
+ avg_length: Optional[float] = None
160
+ stddev_length: Optional[float] = None
161
+ zero_length: int = 0
162
+ zero_length_p: Optional[float] = None
163
+ non_zero_length: int = 0
164
+ non_zero_length_p: Optional[float] = None
165
+
166
+ # Boolean metrics
167
+ trues: int = 0
168
+ trues_p: Optional[float] = None
169
+ falses: int = 0
170
+ falses_p: Optional[float] = None
171
+
172
+ # Distribution data (PipeRider format)
173
+ topk: Optional[Dict] = None # {"values": [...], "counts": [...]}
174
+ histogram: Optional[Dict] = None # {"labels": [...], "counts": [...], "bin_edges": [...]}
175
+ histogram_length: Optional[Dict] = None # For string length distribution
176
+
177
+ # Quality alerts (PipeRider format)
178
+ alerts: List[Dict] = field(default_factory=list)
179
+
180
+ # Profiling metadata
181
+ profile_duration: Optional[str] = None # "1.23" seconds
182
+ elapsed_milli: int = 0 # Duration in milliseconds
183
+
184
+ # Legacy aliases for backward compatibility
185
+ @property
186
+ def column_name(self) -> str:
187
+ return self.name
188
+
189
+ @property
190
+ def data_type(self) -> str:
191
+ return self.type
192
+
193
+ @property
194
+ def row_count(self) -> int:
195
+ return self.samples
196
+
197
+ @property
198
+ def null_count(self) -> int:
199
+ return self.nulls
200
+
201
+ @property
202
+ def null_percent(self) -> float:
203
+ return (self.nulls_p or 0.0) * 100
204
+
205
+ @property
206
+ def distinct_count(self) -> int:
207
+ return self.distinct
208
+
209
+ @property
210
+ def distinct_percent(self) -> float:
211
+ return (self.distinct_p or 0.0) * 100
212
+
213
+ @property
214
+ def duration_ms(self) -> int:
215
+ return self.elapsed_milli
216
+
217
+
218
+ @dataclass
219
+ class TableProfile:
220
+ """Profile result for a table."""
221
+ source_name: str
222
+ table_name: str
223
+ connection_name: str
224
+ row_count: int
225
+ column_count: int
226
+ columns: List[ColumnProfile]
227
+ profile_mode: str
228
+ profiled_at: datetime
229
+ duration_ms: int
230
+ alerts: List[Dict] = field(default_factory=list)
231
+ status: str = "success"
232
+ error: Optional[str] = None
233
+
234
+
235
+ @dataclass
236
+ class ProfileExecutionResult:
237
+ """Result of profile execution."""
238
+ tables_profiled: int = 0
239
+ total_rows: int = 0
240
+ total_columns: int = 0
241
+ total_alerts: int = 0
242
+ duration_ms: int = 0
243
+ profiles: List[TableProfile] = field(default_factory=list)
244
+ errors: List[str] = field(default_factory=list)
245
+
246
+
247
+ class ProfileTask(BaseTask):
248
+ """
249
+ DAG-based profiling task for DVT (PipeRider-style).
250
+
251
+ v0.58.1: Uses fast SQL-based profiling queries instead of slow ydata-profiling.
252
+
253
+ Execution flow:
254
+ 1. Parse selectors (--select, --exclude)
255
+ 2. Build execution list (sources + models)
256
+ 3. For each node:
257
+ a. Execute efficient SQL profiling queries
258
+ b. Collect PipeRider-style metrics
259
+ c. Store results in metadata_store.duckdb
260
+ 4. Display summary (PipeRider-style)
261
+ """
262
+
263
+ def __init__(
264
+ self,
265
+ flags: Any,
266
+ runtime_config: RuntimeConfig,
267
+ manifest: Manifest,
268
+ ):
269
+ super().__init__(flags) # BaseTask only takes flags, sets self.args
270
+ self.runtime_config = runtime_config
271
+ self.manifest = manifest
272
+ # v0.58.7: Use lowercase parameter names (fixed from uppercase)
273
+ self._sample_str = getattr(self.args, "sample", None)
274
+ self._threads = getattr(self.args, "threads", 4) or 4
275
+
276
+ def _parse_sample(self, sample_str: Optional[str], total_rows: int) -> int:
277
+ """Parse sample string into row count.
278
+
279
+ Args:
280
+ sample_str: Sample specification (e.g., "10000" or "10%")
281
+ total_rows: Total rows in the table
282
+
283
+ Returns:
284
+ Number of rows to sample
285
+ """
286
+ if not sample_str:
287
+ return total_rows
288
+ sample_str = sample_str.strip()
289
+ if sample_str.endswith('%'):
290
+ pct = float(sample_str[:-1]) / 100
291
+ return max(1, int(total_rows * pct))
292
+ return min(int(sample_str), total_rows)
293
+
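[Editorial aside, not part of the packaged file: the --sample arithmetic that _parse_sample implements can be shown with a self-contained sketch; the behaviour below follows directly from the method above.]

    from typing import Optional

    def parse_sample(sample_str: Optional[str], total_rows: int) -> int:
        # Mirrors ProfileTask._parse_sample above: "N%" takes a fraction of the
        # table, a plain integer is capped at the table size, None means all rows.
        if not sample_str:
            return total_rows
        sample_str = sample_str.strip()
        if sample_str.endswith("%"):
            pct = float(sample_str[:-1]) / 100
            return max(1, int(total_rows * pct))
        return min(int(sample_str), total_rows)

    print(parse_sample("10%", 50_000))   # -> 5000
    print(parse_sample("10000", 2_500))  # -> 2500 (capped at total rows)
    print(parse_sample(None, 2_500))     # -> 2500 (no sampling)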
294
+ def run(self) -> ProfileExecutionResult:
295
+ """Execute profiling on selected sources and models."""
296
+ start_time = time.time()
297
+ result = ProfileExecutionResult()
298
+
299
+ # Build sample display string
300
+ sample_display = self._sample_str if self._sample_str else "all rows"
301
+
302
+ # Print header with Rich Panel
303
+ if HAS_RICH:
304
+ console.print()
305
+ header_panel = Panel(
306
+ f"[bold cyan]Sample:[/bold cyan] [yellow]{sample_display}[/yellow] | "
307
+ f"[bold cyan]Threads:[/bold cyan] [yellow]{self._threads}[/yellow]",
308
+ title="[bold magenta]DVT Profile - Data Profiling[/bold magenta]",
309
+ subtitle="[dim]PipeRider-style fast SQL profiling[/dim]",
310
+ border_style="magenta",
311
+ box=box.DOUBLE,
312
+ )
313
+ console.print(header_panel)
314
+ console.print()
315
+ else:
316
+ print("\n" + "=" * 60)
317
+ print(" DVT Profile - Data Profiling")
318
+ print(f" Sample: {sample_display} | Threads: {self._threads}")
319
+ print("=" * 60 + "\n")
320
+
321
+ # Get selected nodes
322
+ nodes = self._get_selected_nodes()
323
+
324
+ if not nodes:
325
+ if HAS_RICH:
326
+ console.print("[yellow]No sources or models selected for profiling.[/yellow]")
327
+ console.print("[dim]Use --select to specify targets, e.g.: dvt profile run --select 'source:*'[/dim]")
328
+ else:
329
+ print("No sources or models selected for profiling.")
330
+ return result
331
+
332
+ # Profile with progress display
333
+ if HAS_RICH:
334
+ result = self._profile_with_progress(nodes, result)
335
+ else:
336
+ result = self._profile_without_progress(nodes, result)
337
+
338
+ # Calculate duration
339
+ result.duration_ms = int((time.time() - start_time) * 1000)
340
+
341
+ # Print summary
342
+ self._print_summary(result)
343
+
344
+ return result
345
+
346
+ def _profile_with_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
347
+ """Profile nodes with Rich progress display."""
348
+ with Progress(
349
+ SpinnerColumn(),
350
+ TextColumn("[bold blue]{task.description}"),
351
+ BarColumn(bar_width=40),
352
+ TaskProgressColumn(),
353
+ MofNCompleteColumn(),
354
+ TimeElapsedColumn(),
355
+ console=console,
356
+ ) as progress:
357
+ main_task = progress.add_task("[cyan]Profiling...", total=len(nodes))
358
+
359
+ # Profile each node
360
+ for i, node in enumerate(nodes, 1):
361
+ node_name = self._get_node_display_name(node)
362
+ progress.update(main_task, description=f"[cyan]Profiling[/cyan] [bold]{node_name}[/bold]")
363
+
364
+ profile = self._profile_node(node, i, len(nodes))
365
+ if profile:
366
+ result.profiles.append(profile)
367
+ result.tables_profiled += 1
368
+ result.total_rows += profile.row_count
369
+ result.total_columns += profile.column_count
370
+ result.total_alerts += len(profile.alerts)
371
+ for col in profile.columns:
372
+ result.total_alerts += len(col.alerts)
373
+
374
+ # Store in metadata_store.duckdb
375
+ self._store_profile(profile)
376
+
377
+ # Show result line
378
+ status_icon = "[green]OK[/green]" if profile.status == "success" else "[red]FAIL[/red]"
379
+ console.print(
380
+ f" {status_icon} {node_name} "
381
+ f"[dim]({profile.row_count:,} rows, {profile.column_count} cols, {profile.duration_ms}ms)[/dim]"
382
+ )
383
+
384
+ # Show detailed column profile (PipeRider-style)
385
+ self._print_table_profile(profile)
386
+
387
+ progress.advance(main_task)
388
+
389
+ return result
390
+
391
+ def _profile_without_progress(self, nodes: List[Any], result: ProfileExecutionResult) -> ProfileExecutionResult:
392
+ """Profile nodes without Rich (fallback)."""
393
+ for i, node in enumerate(nodes, 1):
394
+ node_name = self._get_node_display_name(node)
395
+ print(f" [{i}/{len(nodes)}] Profiling {node_name}...")
396
+
397
+ profile = self._profile_node(node, i, len(nodes))
398
+ if profile:
399
+ result.profiles.append(profile)
400
+ result.tables_profiled += 1
401
+ result.total_rows += profile.row_count
402
+ result.total_columns += profile.column_count
403
+
404
+ self._store_profile(profile)
405
+
406
+ status = "OK" if profile.status == "success" else "FAIL"
407
+ print(f" {status} ({profile.row_count:,} rows, {profile.column_count} cols)")
408
+
409
+ # Show detailed column profile (text fallback)
410
+ self._print_table_profile(profile)
411
+
412
+ return result
413
+
414
+ def _get_selected_nodes(self) -> List[Any]:
415
+ """Get list of nodes to profile based on selectors."""
416
+ nodes = []
417
+
418
+ # v0.58.7: Use lowercase parameter names (fixed from uppercase)
419
+ selector = getattr(self.args, "select", None)
420
+ exclude = getattr(self.args, "exclude", None)
421
+
422
+ if not selector:
423
+ # Default: profile all sources
424
+ for source_id, source in self.manifest.sources.items():
425
+ nodes.append(source)
426
+ else:
427
+ # Parse selection
428
+ for sel in selector:
429
+ if isinstance(sel, tuple):
430
+ for s in sel:
431
+ nodes.extend(self._parse_selector(s))
432
+ else:
433
+ nodes.extend(self._parse_selector(sel))
434
+
435
+ # Apply exclusions
436
+ if exclude:
437
+ excluded = set()
438
+ for exc in exclude:
439
+ if isinstance(exc, tuple):
440
+ for e in exc:
441
+ excluded.update(self._get_excluded_ids(e))
442
+ else:
443
+ excluded.update(self._get_excluded_ids(exc))
444
+ nodes = [n for n in nodes if self._get_node_id(n) not in excluded]
445
+
446
+ return nodes
447
+
448
+ def _parse_selector(self, selector: str) -> List[Any]:
449
+ """Parse a selector string into nodes."""
450
+ nodes = []
451
+
452
+ if selector.startswith("source:"):
453
+ # Source selector: source:* or source:postgres.*
454
+ pattern = selector[7:] # Remove "source:" prefix
455
+ for source_id, source in self.manifest.sources.items():
456
+ if self._matches_pattern(source, pattern):
457
+ nodes.append(source)
458
+
459
+ elif selector.startswith("model:"):
460
+ # Model selector: model:* or model:staging.*
461
+ pattern = selector[6:] # Remove "model:" prefix
462
+ for node_id, node in self.manifest.nodes.items():
463
+ if hasattr(node, "resource_type") and node.resource_type.value == "model":
464
+ if self._matches_pattern(node, pattern):
465
+ nodes.append(node)
466
+
467
+ elif "*" in selector:
468
+ # Wildcard - match both sources and models
469
+ pattern = selector
470
+ for source_id, source in self.manifest.sources.items():
471
+ if self._matches_pattern(source, pattern):
472
+ nodes.append(source)
473
+ for node_id, node in self.manifest.nodes.items():
474
+ if hasattr(node, "resource_type") and node.resource_type.value == "model":
475
+ if self._matches_pattern(node, pattern):
476
+ nodes.append(node)
477
+
478
+ else:
479
+ # Exact match by name
480
+ for source_id, source in self.manifest.sources.items():
481
+ if source.name == selector or source.identifier == selector:
482
+ nodes.append(source)
483
+ for node_id, node in self.manifest.nodes.items():
484
+ if hasattr(node, "name") and node.name == selector:
485
+ nodes.append(node)
486
+
487
+ return nodes
488
+
489
+ def _matches_pattern(self, node: Any, pattern: str) -> bool:
490
+ """Check if a node matches a glob pattern."""
491
+ import fnmatch
492
+
493
+ if pattern == "*":
494
+ return True
495
+
496
+ name = getattr(node, "name", "")
497
+ identifier = getattr(node, "identifier", name)
498
+ source_name = getattr(node, "source_name", "")
499
+ unique_id = getattr(node, "unique_id", "")
500
+
501
+ # Try matching against different attributes
502
+ full_name = f"{source_name}.{identifier}" if source_name else identifier
503
+
504
+ # Extract just the source_name.table portion from unique_id
505
+ # unique_id format: source.project_name.source_name.table_name
506
+ # We want to match against: project_name.source_name.table_name
507
+ parts = unique_id.split(".")
508
+ if len(parts) >= 4 and parts[0] == "source":
509
+ # project_name.source_name.table_name
510
+ project_source_table = ".".join(parts[1:])
511
+ source_table = ".".join(parts[2:]) # source_name.table_name
512
+ else:
513
+ project_source_table = unique_id
514
+ source_table = full_name
515
+
516
+ return (
517
+ fnmatch.fnmatch(name, pattern) or
518
+ fnmatch.fnmatch(identifier, pattern) or
519
+ fnmatch.fnmatch(full_name, pattern) or
520
+ fnmatch.fnmatch(project_source_table, pattern) or
521
+ fnmatch.fnmatch(source_table, pattern) or
522
+ fnmatch.fnmatch(unique_id, pattern)
523
+ )
524
+
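[Editorial aside, illustrative only and using a made-up unique_id: the decomposition above means a source can be selected with or without the project prefix. A minimal standalone sketch of the matching:]

    import fnmatch

    unique_id = "source.my_project.raw.orders"      # hypothetical example
    parts = unique_id.split(".")
    project_source_table = ".".join(parts[1:])      # "my_project.raw.orders"
    source_table = ".".join(parts[2:])              # "raw.orders"

    for pattern in ("raw.*", "my_project.raw.orders", "orders"):
        matched = (
            fnmatch.fnmatch(parts[-1], pattern)
            or fnmatch.fnmatch(source_table, pattern)
            or fnmatch.fnmatch(project_source_table, pattern)
        )
        print(f"{pattern!r} -> {matched}")          # all three print True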
525
+ def _get_excluded_ids(self, exclude_str: str) -> Set[str]:
526
+ """Get IDs of nodes matching exclusion pattern."""
527
+ ids = set()
528
+ nodes = self._parse_selector(exclude_str)
529
+ for node in nodes:
530
+ ids.add(self._get_node_id(node))
531
+ return ids
532
+
533
+ def _get_node_id(self, node: Any) -> str:
534
+ """Get unique ID for a node."""
535
+ if hasattr(node, "unique_id"):
536
+ return node.unique_id
537
+ return getattr(node, "name", str(node))
538
+
539
+ def _get_node_display_name(self, node: Any) -> str:
540
+ """Get display name for a node."""
541
+ if isinstance(node, SourceDefinition):
542
+ return f"{node.source_name}.{node.identifier}"
543
+ else:
544
+ return getattr(node, "name", str(node))
545
+
546
+ def _profile_node(self, node: Any, index: int, total: int) -> Optional[TableProfile]:
547
+ """Profile a single node (source or model)."""
548
+ start_time = time.time()
549
+
550
+ # Get node info
551
+ if isinstance(node, SourceDefinition):
552
+ source_name = node.source_name
553
+ table_name = node.identifier
554
+ connection_name = getattr(node, "config", {}).get("target", "default")
555
+ node_type = "source"
556
+ else:
557
+ source_name = "models"
558
+ table_name = node.name
559
+ connection_name = getattr(node.config, "target", "default") if hasattr(node, "config") else "default"
560
+ node_type = "model"
561
+
562
+ try:
563
+ # Execute profiling
564
+ columns = self._execute_profile(node)
565
+
566
+ duration_ms = int((time.time() - start_time) * 1000)
567
+
568
+ # Calculate totals - use samples field (actual profiled rows)
569
+ row_count = columns[0].samples if columns else 0
570
+ total_rows = columns[0].total if columns else 0
571
+
572
+ # Collect alerts
573
+ alerts = []
574
+ for col in columns:
575
+ alerts.extend(col.alerts)
576
+
577
+ profile = TableProfile(
578
+ source_name=source_name,
579
+ table_name=table_name,
580
+ connection_name=connection_name,
581
+ row_count=total_rows, # Total rows in table
582
+ column_count=len(columns),
583
+ columns=columns,
584
+ profile_mode="standard", # v0.58.7: Single standard mode
585
+ profiled_at=datetime.now(),
586
+ duration_ms=duration_ms,
587
+ alerts=alerts,
588
+ status="success",
589
+ )
590
+
591
+ return profile
592
+
593
+ except Exception as e:
594
+ duration_ms = int((time.time() - start_time) * 1000)
595
+ return TableProfile(
596
+ source_name=source_name,
597
+ table_name=table_name,
598
+ connection_name=connection_name,
599
+ row_count=0,
600
+ column_count=0,
601
+ columns=[],
602
+ profile_mode="standard", # v0.58.7: Single standard mode
603
+ profiled_at=datetime.now(),
604
+ duration_ms=duration_ms,
605
+ status="error",
606
+ error=str(e),
607
+ )
608
+
609
+ def _execute_profile(self, node: Any) -> List[ColumnProfile]:
610
+ """
611
+ Execute PipeRider-style profiling queries on a node.
612
+
613
+ Uses efficient SQL queries to compute:
614
+ - row_count, null_count, distinct_count
615
+ - min, max, mean, stddev (numeric)
616
+ - min_length, max_length, avg_length (string)
617
+ - top_values (categorical)
618
+
619
+ v0.58.7: Added sampling support via --sample flag.
620
+ """
621
+ columns = []
622
+
623
+ # Get table info
624
+ if isinstance(node, SourceDefinition):
625
+ schema = node.schema
626
+ table = node.identifier
627
+ database = getattr(node, "database", None)
628
+ target_name = node.config.get("target") if hasattr(node, "config") else None
629
+ else:
630
+ schema = node.schema
631
+ table = node.alias or node.name
632
+ database = getattr(node, "database", None)
633
+ target_name = getattr(node.config, "target", None) if hasattr(node, "config") else None
634
+
635
+ # Get adapter for connection
636
+ adapter = self._get_adapter(target_name)
637
+
638
+ # Get column info - either from node definition or by querying database
639
+ node_columns = getattr(node, "columns", {})
640
+
641
+ if not node_columns:
642
+ # Query database for column info
643
+ column_info = self._get_columns_from_db(adapter, database, schema, table)
644
+ else:
645
+ column_info = [
646
+ (col_name, getattr(col_info, "data_type", "VARCHAR") or "VARCHAR")
647
+ for col_name, col_info in node_columns.items()
648
+ ]
649
+
650
+ if not column_info:
651
+ # Fallback: profile as single row count only
652
+ row_count = self._get_row_count(adapter, database, schema, table)
653
+ return [ColumnProfile(
654
+ name="_table_",
655
+ type="TABLE",
656
+ schema_type="TABLE",
657
+ total=row_count,
658
+ samples=row_count,
659
+ )]
660
+
661
+ # Get row count once for all columns
662
+ total_row_count = self._get_row_count(adapter, database, schema, table)
663
+
664
+ # v0.58.7: Calculate sample size
665
+ sample_row_count = self._parse_sample(self._sample_str, total_row_count)
666
+ is_sampling = sample_row_count < total_row_count
667
+
668
+ # Profile columns in parallel using threads
669
+ if self._threads > 1 and len(column_info) > 1:
670
+ with ThreadPoolExecutor(max_workers=min(self._threads, len(column_info))) as executor:
671
+ futures = {
672
+ executor.submit(
673
+ self._profile_column_sql,
674
+ adapter, database, schema, table,
675
+ col_name, col_type, total_row_count, sample_row_count
676
+ ): (col_name, col_type)
677
+ for col_name, col_type in column_info
678
+ }
679
+ for future in as_completed(futures):
680
+ try:
681
+ profile = future.result()
682
+ columns.append(profile)
683
+ except Exception as e:
684
+ col_name, col_type = futures[future]
685
+ columns.append(ColumnProfile(
686
+ name=col_name,
687
+ type=self._classify_type(col_type),
688
+ schema_type=col_type,
689
+ total=total_row_count,
690
+ samples=sample_row_count,
691
+ alerts=[{"type": "PROFILE_ERROR", "severity": "warning", "message": str(e)[:100]}]
692
+ ))
693
+ else:
694
+ # Sequential profiling
695
+ for col_name, col_type in column_info:
696
+ profile = self._profile_column_sql(
697
+ adapter, database, schema, table,
698
+ col_name, col_type, total_row_count, sample_row_count
699
+ )
700
+ columns.append(profile)
701
+
702
+ return columns
703
+
704
+ def _get_adapter(self, target_name: Optional[str] = None):
705
+ """Get adapter for the specified target or default."""
706
+ from dbt.adapters.factory import get_adapter
707
+
708
+ # Get adapter from runtime config
709
+ adapter = get_adapter(self.runtime_config)
710
+ return adapter
711
+
712
+ def _get_columns_from_db(
713
+ self, adapter, database: Optional[str], schema: str, table: str
714
+ ) -> List[tuple]:
715
+ """Query database to get column names and types."""
716
+ try:
717
+ # Use adapter's get_columns_in_relation
718
+ from dbt.adapters.base import BaseRelation
719
+
720
+ relation = adapter.Relation.create(
721
+ database=database,
722
+ schema=schema,
723
+ identifier=table,
724
+ )
725
+
726
+ with adapter.connection_named("profile"):
727
+ columns = adapter.get_columns_in_relation(relation)
728
+ return [(col.name, col.dtype) for col in columns]
729
+ except Exception:
730
+ return []
731
+
732
+ def _get_row_count(
733
+ self, adapter, database: Optional[str], schema: str, table: str
734
+ ) -> int:
735
+ """Get row count from table."""
736
+ try:
737
+ fqn = self._build_fqn(adapter, database, schema, table)
738
+ sql = f"SELECT COUNT(*) as cnt FROM {fqn}"
739
+
740
+ with adapter.connection_named("profile"):
741
+ _, result = adapter.execute(sql, fetch=True)
742
+ if result and len(result) > 0:
743
+ return int(result[0][0])
744
+ except Exception:
745
+ pass
746
+ return 0
747
+
748
+ def _build_fqn(
749
+ self, adapter, database: Optional[str], schema: str, table: str
750
+ ) -> str:
751
+ """Build fully qualified table name."""
752
+ parts = []
753
+ if database:
754
+ parts.append(adapter.quote(database))
755
+ if schema:
756
+ parts.append(adapter.quote(schema))
757
+ parts.append(adapter.quote(table))
758
+ return ".".join(parts)
759
+
760
+ def _classify_type(self, col_type: str) -> str:
761
+ """Classify database type into PipeRider generic type."""
762
+ col_type_lower = col_type.lower()
763
+
764
+ if any(t in col_type_lower for t in ["int", "bigint", "smallint", "tinyint", "serial"]):
765
+ return "integer"
766
+ elif any(t in col_type_lower for t in ["numeric", "decimal", "float", "double", "real", "number"]):
767
+ return "numeric"
768
+ elif any(t in col_type_lower for t in ["char", "varchar", "text", "string", "clob"]):
769
+ return "string"
770
+ elif any(t in col_type_lower for t in ["date", "time", "timestamp"]):
771
+ return "datetime"
772
+ elif any(t in col_type_lower for t in ["bool", "boolean"]):
773
+ return "boolean"
774
+ else:
775
+ return "other"
776
+
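[Editorial aside, not part of the file: a quick illustration of how the keyword checks above bucket common database types; the helper here re-implements _classify_type only so the example is runnable on its own.]

    def classify(col_type: str) -> str:
        # Same keyword buckets, in the same order, as ProfileTask._classify_type above.
        t = col_type.lower()
        if any(k in t for k in ("int", "bigint", "smallint", "tinyint", "serial")):
            return "integer"
        if any(k in t for k in ("numeric", "decimal", "float", "double", "real", "number")):
            return "numeric"
        if any(k in t for k in ("char", "varchar", "text", "string", "clob")):
            return "string"
        if any(k in t for k in ("date", "time", "timestamp")):
            return "datetime"
        if any(k in t for k in ("bool", "boolean")):
            return "boolean"
        return "other"

    assert classify("BIGINT") == "integer"
    assert classify("NUMERIC(10,2)") == "numeric"
    assert classify("VARCHAR(255)") == "string"
    assert classify("TIMESTAMP WITH TIME ZONE") == "datetime"
    assert classify("BOOLEAN") == "boolean"
    assert classify("UUID") == "other"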
777
+ def _profile_column_sql(
778
+ self, adapter, database: Optional[str], schema: str, table: str,
779
+ col_name: str, col_type: str, total_row_count: int, sample_row_count: int
780
+ ) -> ColumnProfile:
781
+ """
782
+ Profile a single column using efficient SQL queries.
783
+
784
+ PipeRider-style: Single-pass or minimal queries for all metrics.
785
+ Uses PipeRider metric names: nulls, non_nulls, distinct, valids, etc.
786
+
787
+ v0.58.7: Added sampling support. When sample_row_count < total_row_count,
788
+ SQL queries will use LIMIT clause to sample rows.
789
+
790
+ Args:
791
+ adapter: Database adapter
792
+ database: Database name
793
+ schema: Schema name
794
+ table: Table name
795
+ col_name: Column name to profile
796
+ col_type: Column data type
797
+ total_row_count: Total rows in the table
798
+ sample_row_count: Number of rows to sample (may equal total_row_count)
799
+ """
800
+ start_time = time.time()
801
+
802
+ generic_type = self._classify_type(col_type)
803
+ is_sampling = sample_row_count < total_row_count
804
+ profile = ColumnProfile(
805
+ name=col_name,
806
+ type=generic_type,
807
+ schema_type=col_type,
808
+ total=total_row_count,
809
+ samples=sample_row_count,
810
+ samples_p=sample_row_count / total_row_count if total_row_count > 0 else 1.0,
811
+ )
812
+
813
+ fqn = self._build_fqn(adapter, database, schema, table)
814
+ quoted_col = adapter.quote(col_name)
815
+
816
+ try:
817
+ # Determine column type category
818
+ col_type_lower = col_type.lower()
819
+ is_numeric = any(t in col_type_lower for t in [
820
+ "int", "numeric", "decimal", "float", "double", "real", "number", "bigint", "smallint"
821
+ ])
822
+ is_string = any(t in col_type_lower for t in [
823
+ "char", "varchar", "text", "string", "clob"
824
+ ])
825
+
826
+ # Build comprehensive profiling query based on column type
827
+ if is_numeric:
828
+ profile = self._profile_numeric_column(
829
+ adapter, fqn, quoted_col, col_name, col_type,
830
+ total_row_count, sample_row_count
831
+ )
832
+ elif is_string:
833
+ profile = self._profile_string_column(
834
+ adapter, fqn, quoted_col, col_name, col_type,
835
+ total_row_count, sample_row_count
836
+ )
837
+ else:
838
+ # Default: basic metrics only
839
+ profile = self._profile_basic_column(
840
+ adapter, fqn, quoted_col, col_name, col_type,
841
+ total_row_count, sample_row_count
842
+ )
843
+
844
+ # Get top values for categorical columns
845
+ if profile.distinct and profile.distinct <= 100:
846
+ self._add_top_values(adapter, fqn, quoted_col, profile, sample_row_count)
847
+
848
+ except Exception as e:
849
+ # If SQL fails, return what we have
850
+ profile.alerts.append({
851
+ "type": "PROFILE_ERROR",
852
+ "severity": "warning",
853
+ "message": f"Could not profile column: {str(e)[:100]}",
854
+ })
855
+
856
+ # Generate quality alerts
857
+ profile.alerts.extend(self._generate_alerts(profile))
858
+
859
+ profile.elapsed_milli = int((time.time() - start_time) * 1000)
860
+ profile.profile_duration = f"{(time.time() - start_time):.2f}"
861
+ return profile
862
+
863
+ def _profile_numeric_column(
864
+ self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str,
865
+ total_row_count: int, sample_row_count: int
866
+ ) -> ColumnProfile:
867
+ """Profile a numeric column with all stats in one query (PipeRider-style).
868
+
869
+ v0.58.7: Added sampling support via subquery when sample_row_count < total_row_count.
870
+ """
871
+ generic_type = self._classify_type(col_type)
872
+ is_sampling = sample_row_count < total_row_count
873
+ profile = ColumnProfile(
874
+ name=col_name,
875
+ type=generic_type,
876
+ schema_type=col_type,
877
+ total=total_row_count,
878
+ samples=sample_row_count,
879
+ samples_p=sample_row_count / total_row_count if total_row_count > 0 else 1.0,
880
+ )
881
+
882
+ # Build source expression - use subquery with LIMIT when sampling
883
+ if is_sampling:
884
+ source_expr = f"(SELECT * FROM {fqn} LIMIT {sample_row_count}) AS sampled"
885
+ else:
886
+ source_expr = fqn
887
+
888
+ # Single comprehensive query for numeric columns (PipeRider-style)
889
+ sql = f"""
890
+ SELECT
891
+ COUNT(*) - COUNT({quoted_col}) as nulls,
892
+ COUNT({quoted_col}) as non_nulls,
893
+ COUNT(DISTINCT {quoted_col}) as distinct_val,
894
+ MIN({quoted_col}) as min_val,
895
+ MAX({quoted_col}) as max_val,
896
+ SUM(CAST({quoted_col} AS DOUBLE PRECISION)) as sum_val,
897
+ AVG(CAST({quoted_col} AS DOUBLE PRECISION)) as avg_val,
898
+ STDDEV(CAST({quoted_col} AS DOUBLE PRECISION)) as stddev_val,
899
+ SUM(CASE WHEN {quoted_col} = 0 THEN 1 ELSE 0 END) as zeros,
900
+ SUM(CASE WHEN {quoted_col} < 0 THEN 1 ELSE 0 END) as negatives,
901
+ SUM(CASE WHEN {quoted_col} > 0 THEN 1 ELSE 0 END) as positives
902
+ FROM {source_expr}
903
+ """
904
+
905
+ try:
906
+ with adapter.connection_named("profile"):
907
+ _, result = adapter.execute(sql, fetch=True)
908
+ if result and len(result) > 0:
909
+ row = result[0]
910
+ # PipeRider-style metric names
911
+ profile.nulls = int(row[0] or 0)
912
+ profile.non_nulls = int(row[1] or 0)
913
+ profile.distinct = int(row[2] or 0)
914
+ profile.min = float(row[3]) if row[3] is not None else None
915
+ profile.max = float(row[4]) if row[4] is not None else None
916
+ profile.sum = float(row[5]) if row[5] is not None else None
917
+ profile.avg = float(row[6]) if row[6] is not None else None
918
+ profile.stddev = float(row[7]) if row[7] is not None else None
919
+ profile.zeros = int(row[8] or 0)
920
+ profile.negatives = int(row[9] or 0)
921
+ profile.positives = int(row[10] or 0)
922
+
923
+ # Calculate percentages based on sampled rows (PipeRider-style with decimal 0-1)
924
+ sampled = profile.samples
925
+ if sampled > 0:
926
+ profile.nulls_p = profile.nulls / sampled
927
+ profile.non_nulls_p = profile.non_nulls / sampled
928
+ profile.distinct_p = profile.distinct / sampled if profile.non_nulls > 0 else None
929
+ profile.zeros_p = profile.zeros / sampled
930
+ profile.negatives_p = profile.negatives / sampled
931
+ profile.positives_p = profile.positives / sampled
932
+
933
+ # Validity metrics (for numeric, valid = non-null)
934
+ profile.valids = profile.non_nulls
935
+ profile.valids_p = profile.non_nulls_p
936
+ profile.invalids = profile.nulls
937
+ profile.invalids_p = profile.nulls_p
938
+
939
+ # Duplicate metrics
940
+ if profile.non_nulls > 0 and profile.distinct > 0:
941
+ profile.non_duplicates = profile.distinct
942
+ profile.duplicates = profile.non_nulls - profile.distinct
943
+ profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
944
+ profile.duplicates_p = profile.duplicates / profile.non_nulls
945
+
946
+ # Always get percentiles in standard mode (v0.58.7: single comprehensive mode)
947
+ self._add_percentiles(adapter, fqn, quoted_col, profile, sample_row_count)
948
+
949
+ except Exception:
950
+ # Fall back to basic profile
951
+ profile = self._profile_basic_column(
952
+ adapter, fqn, quoted_col, col_name, col_type,
953
+ total_row_count, sample_row_count
954
+ )
955
+
956
+ return profile
957
+
958
+ def _profile_string_column(
959
+ self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str,
960
+ total_row_count: int, sample_row_count: int
961
+ ) -> ColumnProfile:
962
+ """Profile a string column with all stats in one query (PipeRider-style).
963
+
964
+ v0.58.7: Added sampling support via subquery when sample_row_count < total_row_count.
965
+ """
966
+ generic_type = self._classify_type(col_type)
967
+ is_sampling = sample_row_count < total_row_count
968
+ profile = ColumnProfile(
969
+ name=col_name,
970
+ type=generic_type,
971
+ schema_type=col_type,
972
+ total=total_row_count,
973
+ samples=sample_row_count,
974
+ samples_p=sample_row_count / total_row_count if total_row_count > 0 else 1.0,
975
+ )
976
+
977
+ # Build source expression - use subquery with LIMIT when sampling
978
+ if is_sampling:
979
+ source_expr = f"(SELECT * FROM {fqn} LIMIT {sample_row_count}) AS sampled"
980
+ else:
981
+ source_expr = fqn
982
+
983
+ # Single comprehensive query for string columns (PipeRider-style)
984
+ sql = f"""
985
+ SELECT
986
+ COUNT(*) - COUNT({quoted_col}) as nulls,
987
+ COUNT({quoted_col}) as non_nulls,
988
+ COUNT(DISTINCT {quoted_col}) as distinct_val,
989
+ MIN(LENGTH({quoted_col})) as min_len,
990
+ MAX(LENGTH({quoted_col})) as max_len,
991
+ AVG(LENGTH({quoted_col})) as avg_len,
992
+ SUM(CASE WHEN LENGTH({quoted_col}) = 0 THEN 1 ELSE 0 END) as zero_length_count
993
+ FROM {source_expr}
994
+ """
995
+
996
+ try:
997
+ with adapter.connection_named("profile"):
998
+ _, result = adapter.execute(sql, fetch=True)
999
+ if result and len(result) > 0:
1000
+ row = result[0]
1001
+ # PipeRider-style metric names
1002
+ profile.nulls = int(row[0] or 0)
1003
+ profile.non_nulls = int(row[1] or 0)
1004
+ profile.distinct = int(row[2] or 0)
1005
+ profile.min_length = int(row[3]) if row[3] is not None else None
1006
+ profile.max_length = int(row[4]) if row[4] is not None else None
1007
+ profile.avg_length = float(row[5]) if row[5] is not None else None
1008
+ profile.zero_length = int(row[6] or 0)
1009
+
1010
+ # Calculate percentages based on sampled rows (PipeRider-style with decimal 0-1)
1011
+ sampled = profile.samples
1012
+ if sampled > 0:
1013
+ profile.nulls_p = profile.nulls / sampled
1014
+ profile.non_nulls_p = profile.non_nulls / sampled
1015
+ profile.distinct_p = profile.distinct / sampled if profile.non_nulls > 0 else None
1016
+ profile.zero_length_p = profile.zero_length / sampled
1017
+
1018
+ # Validity metrics (for string, valid = non-null non-empty)
1019
+ profile.valids = profile.non_nulls - profile.zero_length
1020
+ profile.invalids = profile.nulls + profile.zero_length
1021
+ if sampled > 0:
1022
+ profile.valids_p = profile.valids / sampled
1023
+ profile.invalids_p = profile.invalids / sampled
1024
+
1025
+ # Non-zero length
1026
+ profile.non_zero_length = profile.non_nulls - profile.zero_length
1027
+ if profile.non_nulls > 0:
1028
+ profile.non_zero_length_p = profile.non_zero_length / profile.non_nulls
1029
+
1030
+ # Duplicate metrics
1031
+ if profile.non_nulls > 0 and profile.distinct > 0:
1032
+ profile.non_duplicates = profile.distinct
1033
+ profile.duplicates = profile.non_nulls - profile.distinct
1034
+ profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
1035
+ profile.duplicates_p = profile.duplicates / profile.non_nulls
1036
+
1037
+ except Exception:
1038
+ # Fall back to basic profile
1039
+ profile = self._profile_basic_column(
1040
+ adapter, fqn, quoted_col, col_name, col_type,
1041
+ total_row_count, sample_row_count
1042
+ )
1043
+
1044
+ return profile
1045
+
1046
+ def _profile_basic_column(
1047
+ self, adapter, fqn: str, quoted_col: str, col_name: str, col_type: str,
1048
+ total_row_count: int, sample_row_count: int
1049
+ ) -> ColumnProfile:
1050
+ """Profile any column with basic metrics only (PipeRider-style).
1051
+
1052
+ v0.58.7: Added sampling support via subquery when sample_row_count < total_row_count.
1053
+ """
1054
+ generic_type = self._classify_type(col_type)
1055
+ is_sampling = sample_row_count < total_row_count
1056
+ profile = ColumnProfile(
1057
+ name=col_name,
1058
+ type=generic_type,
1059
+ schema_type=col_type,
1060
+ total=total_row_count,
1061
+ samples=sample_row_count,
1062
+ samples_p=sample_row_count / total_row_count if total_row_count > 0 else 1.0,
1063
+ )
1064
+
1065
+ # Build source expression - use subquery with LIMIT when sampling
1066
+ if is_sampling:
1067
+ source_expr = f"(SELECT * FROM {fqn} LIMIT {sample_row_count}) AS sampled"
1068
+ else:
1069
+ source_expr = fqn
1070
+
1071
+ sql = f"""
1072
+ SELECT
1073
+ COUNT(*) - COUNT({quoted_col}) as nulls,
1074
+ COUNT({quoted_col}) as non_nulls,
1075
+ COUNT(DISTINCT {quoted_col}) as distinct_val
1076
+ FROM {source_expr}
1077
+ """
1078
+
1079
+ try:
1080
+ with adapter.connection_named("profile"):
1081
+ _, result = adapter.execute(sql, fetch=True)
1082
+ if result and len(result) > 0:
1083
+ profile.nulls = int(result[0][0] or 0)
1084
+ profile.non_nulls = int(result[0][1] or 0)
1085
+ profile.distinct = int(result[0][2] or 0)
1086
+
1087
+ # Calculate percentages based on sampled rows (PipeRider-style with decimal 0-1)
1088
+ sampled = profile.samples
1089
+ if sampled > 0:
1090
+ profile.nulls_p = profile.nulls / sampled
1091
+ profile.non_nulls_p = profile.non_nulls / sampled
1092
+ profile.distinct_p = profile.distinct / sampled if profile.non_nulls > 0 else None
1093
+
1094
+ # Validity metrics
1095
+ profile.valids = profile.non_nulls
1096
+ profile.valids_p = profile.non_nulls_p
1097
+ profile.invalids = profile.nulls
1098
+ profile.invalids_p = profile.nulls_p
1099
+
1100
+ # Duplicate metrics
1101
+ if profile.non_nulls > 0 and profile.distinct > 0:
1102
+ profile.non_duplicates = profile.distinct
1103
+ profile.duplicates = profile.non_nulls - profile.distinct
1104
+ profile.non_duplicates_p = profile.non_duplicates / profile.non_nulls
1105
+ profile.duplicates_p = profile.duplicates / profile.non_nulls
1106
+
1107
+ except Exception:
1108
+ pass
1109
+
1110
+ return profile
1111
+
1112
+ def _add_percentiles(
1113
+ self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile,
1114
+ sample_row_count: int = None
1115
+ ) -> None:
1116
+ """Try to add percentiles to numeric profile.
1117
+
1118
+ v0.58.7: Added sampling support via subquery when sample_row_count is provided.
1119
+ """
1120
+ try:
1121
+ # Build source expression for sampling
1122
+ if sample_row_count and sample_row_count < profile.total:
1123
+ source_expr = f"(SELECT {quoted_col} FROM {fqn} WHERE {quoted_col} IS NOT NULL LIMIT {sample_row_count}) AS sampled"
1124
+ where_clause = ""
1125
+ else:
1126
+ source_expr = fqn
1127
+ where_clause = f"WHERE {quoted_col} IS NOT NULL"
1128
+
1129
+ # Try PostgreSQL/Redshift style
1130
+ percentile_sql = f"""
1131
+ SELECT
1132
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {quoted_col}) as p25,
1133
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY {quoted_col}) as p50,
1134
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {quoted_col}) as p75
1135
+ FROM {source_expr}
1136
+ {where_clause}
1137
+ """
1138
+ with adapter.connection_named("profile"):
1139
+ _, result = adapter.execute(percentile_sql, fetch=True)
1140
+ if result and len(result) > 0:
1141
+ row = result[0]
1142
+ profile.p25 = float(row[0]) if row[0] is not None else None
1143
+ profile.p50 = float(row[1]) if row[1] is not None else None
1144
+ profile.p75 = float(row[2]) if row[2] is not None else None
1145
+ except Exception:
1146
+ # Percentiles not supported on this database
1147
+ pass
1148
+
1149
+ def _add_top_values(
1150
+ self, adapter, fqn: str, quoted_col: str, profile: ColumnProfile,
1151
+ sample_row_count: int = None
1152
+ ) -> None:
1153
+ """Add top values to profile (PipeRider topk format).
1154
+
1155
+ v0.58.7: Added sampling support via subquery when sample_row_count is provided.
1156
+ """
1157
+ try:
1158
+ # Build source expression for sampling
1159
+ if sample_row_count and sample_row_count < profile.total:
1160
+ source_expr = f"(SELECT * FROM {fqn} LIMIT {sample_row_count}) AS sampled"
1161
+ else:
1162
+ source_expr = fqn
1163
+
1164
+ top_sql = f"""
1165
+ SELECT {quoted_col} as val, COUNT(*) as cnt
1166
+ FROM {source_expr}
1167
+ WHERE {quoted_col} IS NOT NULL
1168
+ GROUP BY {quoted_col}
1169
+ ORDER BY cnt DESC
1170
+ LIMIT 10
1171
+ """
1172
+ with adapter.connection_named("profile"):
1173
+ _, result = adapter.execute(top_sql, fetch=True)
1174
+ if result:
1175
+ # PipeRider topk format: {"values": [...], "counts": [...]}
1176
+ values = [str(row[0]) for row in result]
1177
+ counts = [int(row[1]) for row in result]
1178
+ profile.topk = {
1179
+ "values": values,
1180
+ "counts": counts,
1181
+ }
1182
+ except Exception:
1183
+ pass
1184
+
1185
+     def _generate_alerts(self, profile: ColumnProfile) -> List[Dict]:
+         """
+         Generate quality alerts for a column profile (PipeRider-style).
+
+         PipeRider alert types (from piperider_cli/profiler/event.py):
+         - missing_value: High percentage of null/missing values
+         - high_distinct: Very high cardinality (possible PK)
+         - low_distinct: Very low cardinality (possible boolean/flag)
+         - all_null: 100% null values
+         - constant: All values are the same
+         - negative_value: Has negative values in numeric column
+         - zero_length_string: Has empty strings
+         """
+         alerts = []
+
+         # Get null percentage (as 0-100 for comparison)
+         nulls_pct = (profile.nulls_p or 0) * 100 if profile.nulls_p is not None else 0
+         distinct_pct = (profile.distinct_p or 0) * 100 if profile.distinct_p is not None else 0
+
+         # High null rate alert (PipeRider: missing_value)
+         if nulls_pct > 50:
+             alerts.append({
+                 "type": "missing_value",
+                 "severity": "error",
+                 "column": profile.name,
+                 "message": f"Column has {nulls_pct:.1f}% null values (>50%)",
+             })
+         elif nulls_pct > 20:
+             alerts.append({
+                 "type": "missing_value",
+                 "severity": "warning",
+                 "column": profile.name,
+                 "message": f"Column has {nulls_pct:.1f}% null values",
+             })
+
+         # High cardinality alert (PipeRider: high_distinct)
+         if distinct_pct > 99 and profile.samples > 100:
+             alerts.append({
+                 "type": "high_distinct",
+                 "severity": "info",
+                 "column": profile.name,
+                 "message": f"Column is {distinct_pct:.1f}% unique (possible primary key)",
+             })
+
+         # Low cardinality (PipeRider: low_distinct)
+         if profile.distinct and profile.distinct < 10 and profile.samples > 1000:
+             alerts.append({
+                 "type": "low_distinct",
+                 "severity": "info",
+                 "column": profile.name,
+                 "message": f"Column has only {profile.distinct} distinct values (possible category)",
+             })
+
+         # All nulls alert (PipeRider: all_null)
+         if nulls_pct >= 100 or (profile.non_nulls == 0 and profile.nulls > 0):
+             alerts.append({
+                 "type": "all_null",
+                 "severity": "error",
+                 "column": profile.name,
+                 "message": "Column is 100% null - consider removing",
+             })
+
+         # Zero variance / Constant alert (PipeRider: constant)
+         if profile.min is not None and profile.max is not None:
+             if profile.min == profile.max and profile.distinct == 1:
+                 alerts.append({
+                     "type": "constant",
+                     "severity": "warning",
+                     "column": profile.name,
+                     "message": f"Column has constant value: {profile.min}",
+                 })
+
+         # Negative values (PipeRider: negative_value) - informational only
+         if profile.negatives and profile.negatives > 0:
+             negatives_pct = (profile.negatives_p or 0) * 100
+             if negatives_pct > 50:
+                 alerts.append({
+                     "type": "negative_value",
+                     "severity": "info",
+                     "column": profile.name,
+                     "message": f"Column has {negatives_pct:.1f}% negative values",
+                 })
+
+         # Zero-length strings (PipeRider: zero_length_string)
+         if profile.zero_length and profile.zero_length > 0:
+             zero_len_pct = (profile.zero_length_p or 0) * 100
+             if zero_len_pct > 10:
+                 alerts.append({
+                     "type": "zero_length_string",
+                     "severity": "warning",
+                     "column": profile.name,
+                     "message": f"Column has {zero_len_pct:.1f}% empty strings",
+                 })
+
+         return alerts
+
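+     # Illustrative example of a single alert dict returned by _generate_alerts above
+     # (hypothetical column and values):
+     #   {
+     #       "type": "missing_value",
+     #       "severity": "warning",
+     #       "column": "email",
+     #       "message": "Column has 23.4% null values",
+     #   }
+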
+     def _print_table_profile(self, profile: TableProfile) -> None:
+         """Print detailed column profile in PipeRider style.
+
+         v0.58.7: Added column-level display with key metrics.
+         """
+         if not profile.columns:
+             return
+
+         if HAS_RICH:
+             # Rich table for column details
+             table = Table(
+                 box=box.SIMPLE,
+                 show_header=True,
+                 padding=(0, 1),
+                 expand=False,
+             )
+             table.add_column("Column", style="cyan", no_wrap=True)
+             table.add_column("Type", style="dim", no_wrap=True)
+             table.add_column("Non-Null", justify="right")
+             table.add_column("Distinct", justify="right")
+             table.add_column("Min", justify="right", max_width=12)
+             table.add_column("Max", justify="right", max_width=12)
+             table.add_column("Mean", justify="right")
+
+             for col in profile.columns:
+                 # Format percentages
+                 non_null_pct = f"{(col.non_nulls_p or 0) * 100:.0f}%" if col.non_nulls_p is not None else "-"
+                 distinct_pct = f"{(col.distinct_p or 0) * 100:.0f}%" if col.distinct_p is not None else "-"
+
+                 # Format min/max values (truncate if too long)
+                 def fmt_val(val, max_len=10):
+                     if val is None:
+                         return "-"
+                     s = str(val)
+                     return s[:max_len] + "..." if len(s) > max_len else s
+
+                 min_val = fmt_val(col.min)
+                 max_val = fmt_val(col.max)
+                 mean_val = f"{col.avg:.2f}" if col.avg is not None else "-"
+
+                 table.add_row(
+                     col.name,
+                     col.type,
+                     non_null_pct,
+                     distinct_pct,
+                     min_val,
+                     max_val,
+                     mean_val,
+                 )
+
+             console.print(table)
+             console.print()  # Blank line after table
+         else:
+             # Text fallback
+             print(" Column Details:")
+             print(" " + "-" * 60)
+             for col in profile.columns:
+                 non_null_pct = f"{(col.non_nulls_p or 0) * 100:.0f}%" if col.non_nulls_p is not None else "-"
+                 distinct_pct = f"{(col.distinct_p or 0) * 100:.0f}%" if col.distinct_p is not None else "-"
+                 print(f" {col.name:20} {col.type:10} Non-Null: {non_null_pct:5} Distinct: {distinct_pct:5}")
+             print()
+
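+     # Example of the truncation helper above (inputs are hypothetical):
+     #   fmt_val(None)                  -> "-"
+     #   fmt_val("2024-01-01T00:00:00") -> "2024-01-01..."
+     #   fmt_val(42)                    -> "42"
+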
+     def _store_profile(self, profile: TableProfile) -> None:
+         """Store profile results in metadata_store.duckdb.
+
+         v0.58.7: Fixed field mappings from ColumnProfile to ColumnProfileResult.
+         """
+         try:
+             # Check if DuckDB is available
+             try:
+                 import duckdb
+             except ImportError:
+                 if HAS_RICH:
+                     console.print("[yellow]Warning: DuckDB not available. Profile results will not be persisted.[/yellow]")
+                 return
+
+             from dbt.compute.metadata import ProjectMetadataStore, ColumnProfileResult
+
+             project_root = Path(self.runtime_config.project_root)
+             store = ProjectMetadataStore(project_root)
+             store.initialize()
+
+             for col in profile.columns:
+                 # Map ColumnProfile fields to ColumnProfileResult fields
+                 # ColumnProfile uses PipeRider-style names (name, samples, nulls, distinct, etc.)
+                 # ColumnProfileResult uses legacy-style names (column_name, row_count, null_count, etc.)
+                 result = ColumnProfileResult(
+                     source_name=profile.source_name,
+                     table_name=profile.table_name,
+                     column_name=col.name,  # PipeRider field: name
+                     profile_mode=profile.profile_mode,
+                     row_count=col.samples,  # PipeRider field: samples
+                     null_count=col.nulls,  # PipeRider field: nulls
+                     null_percent=(col.nulls_p or 0.0) * 100,  # Convert decimal to percentage
+                     distinct_count=col.distinct,  # PipeRider field: distinct
+                     distinct_percent=(col.distinct_p or 0.0) * 100,  # Convert decimal to percentage
+                     min_value=col.min,  # PipeRider field: min
+                     max_value=col.max,  # PipeRider field: max
+                     mean_value=col.avg,  # PipeRider field: avg
+                     median_value=col.p50,  # Use p50 as median
+                     stddev_value=col.stddev,  # PipeRider field: stddev
+                     p25=col.p25,
+                     p50=col.p50,
+                     p75=col.p75,
+                     min_length=col.min_length,
+                     max_length=col.max_length,
+                     avg_length=col.avg_length,
+                     histogram=json.dumps(col.histogram) if col.histogram else None,
+                     top_values=json.dumps(col.topk) if col.topk else None,  # PipeRider field: topk
+                     alerts=json.dumps(col.alerts) if col.alerts else None,
+                     profiled_at=profile.profiled_at,
+                     duration_ms=col.elapsed_milli,  # PipeRider field: elapsed_milli
+                 )
+                 store.save_profile_result(result)
+
+             store.close()
+
+         except Exception as e:
+             # Log but don't fail if storage fails
+             if HAS_RICH:
+                 console.print(f"[yellow]Warning: Could not store profile results: {e}[/yellow]")
+
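+     # Illustrative mapping performed by _store_profile above for one column
+     # (hypothetical values): a ColumnProfile with name="amount", samples=10000,
+     # nulls=120, nulls_p=0.012 is stored as a ColumnProfileResult with
+     # column_name="amount", row_count=10000, null_count=120, null_percent=1.2.
+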
+     def _print_summary(self, result: ProfileExecutionResult) -> None:
+         """Print PipeRider-style summary with Rich formatting."""
+         if not HAS_RICH:
+             print("\n" + "=" * 60)
+             print(" SUMMARY")
+             print(f" Tables profiled: {result.tables_profiled}")
+             print(f" Total rows: {result.total_rows:,}")
+             print(f" Total columns: {result.total_columns}")
+             print(f" Alerts: {result.total_alerts}")
+             print(f" Duration: {result.duration_ms / 1000:.1f}s")
+             print("=" * 60 + "\n")
+             return
+
+         console.print()
+
+         # Summary panel
+         summary_lines = [
+             f"[bold]Tables profiled:[/bold] {result.tables_profiled}",
+             f"[bold]Total rows:[/bold] {result.total_rows:,}",
+             f"[bold]Total columns:[/bold] {result.total_columns}",
+         ]
+
+         if result.total_alerts > 0:
+             summary_lines.append(f"[bold yellow]Alerts:[/bold yellow] {result.total_alerts}")
+         else:
+             summary_lines.append("[bold green]Alerts:[/bold green] 0")
+
+         summary_lines.append(f"[dim]Duration:[/dim] {result.duration_ms / 1000:.1f}s")
+
+         console.print(Panel(
+             "\n".join(summary_lines),
+             title="[bold cyan]Summary[/bold cyan]",
+             border_style="cyan",
+             box=box.ROUNDED,
+         ))
+
+         # List alerts if any
+         if result.total_alerts > 0:
+             console.print()
+             console.print("[bold yellow]Alerts:[/bold yellow]")
+             console.print()
+
+             alerts_table = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
+             alerts_table.add_column("Severity", style="bold", width=8)
+             alerts_table.add_column("Type", style="cyan", width=15)
+             alerts_table.add_column("Location", style="white", width=30)
+             alerts_table.add_column("Message", style="dim")
+
+             for profile in result.profiles:
+                 for col in profile.columns:
+                     for alert in col.alerts:
+                         if alert["severity"] == "error":
+                             sev_display = "[red]ERROR[/red]"
+                         elif alert["severity"] == "warning":
+                             sev_display = "[yellow]WARN[/yellow]"
+                         else:
+                             sev_display = "[blue]INFO[/blue]"
+
+                         # ColumnProfile exposes the PipeRider-style "name" field
+                         location = f"{profile.table_name}.{col.name}"
+                         alerts_table.add_row(
+                             sev_display,
+                             alert["type"],
+                             location,
+                             alert["message"]
+                         )
+
+             console.print(alerts_table)
+
+         console.print()
+
+         # Success footer
+         if result.tables_profiled > 0:
+             console.print("[bold green]Profiling complete![/bold green]")
+             console.print()
+             console.print("[cyan]Results saved to:[/cyan] [bold].dvt/metadata_store.duckdb[/bold]")
+             console.print("[dim]View report: dvt profile serve[/dim]")
+         else:
+             console.print("[yellow]No tables were profiled.[/yellow]")
+
+         console.print()
+
+     def interpret_results(self, result: ProfileExecutionResult) -> bool:
+         """Interpret results to determine success/failure."""
+         if not result.profiles:
+             return False
+         # Success if at least one profile completed
+         return any(p.status == "success" for p in result.profiles)