kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,359 @@
# src/kontra/scout/backends/duckdb_backend.py
"""
DuckDB backend for Scout profiler.

Supports Parquet and CSV files (local + S3/HTTP).
"""

from __future__ import annotations

import os
from typing import Any, Dict, List, Optional, Tuple

import duckdb

try:
    import pyarrow.parquet as pq
    import pyarrow.fs as pafs

    _HAS_PYARROW = True
except ImportError:
    _HAS_PYARROW = False

from kontra.connectors.handle import DatasetHandle
from kontra.engine.backends.duckdb_session import create_duckdb_connection
from kontra.engine.backends.duckdb_utils import esc_ident as duckdb_esc_ident
from kontra.engine.backends.duckdb_utils import lit_str


class DuckDBBackend:
    """
    DuckDB-based profiler backend for Parquet and CSV files.

    Features:
    - Parquet metadata extraction (row count from footer)
    - Single-pass aggregation queries
    - Sampling support
    - S3/HTTP support via DuckDB httpfs
    """

    def __init__(
        self,
        handle: DatasetHandle,
        *,
        sample_size: Optional[int] = None,
    ):
        self.handle = handle
        self.sample_size = sample_size
        self.con: Optional[duckdb.DuckDBPyConnection] = None
        self._parquet_metadata: Optional[Any] = None
        self._view_name = "_scout"

    def connect(self) -> None:
        """Create DuckDB connection and source view."""
        self.con = create_duckdb_connection(self.handle)
        self._create_source_view()

    def close(self) -> None:
        """Clean up resources."""
        if self.con:
            try:
                self.con.execute(f"DROP VIEW IF EXISTS {self._view_name}")
            except Exception:
                pass

    def get_schema(self) -> List[Tuple[str, str]]:
        """Return [(column_name, raw_type), ...]"""
        cur = self.con.execute(f"SELECT * FROM {self._view_name} LIMIT 0")
        return [(d[0], str(d[1])) for d in cur.description]

    def get_row_count(self) -> int:
        """
        Get row count, using Parquet metadata if available.

        For Parquet files, the row count is extracted from the footer
        without scanning data (fast). For CSV/other formats, a COUNT query is used.
        """
        # Try Parquet metadata first (no scan)
        if self.handle.format == "parquet" and _HAS_PYARROW and self.sample_size is None:
            try:
                meta = self._get_parquet_metadata()
                if meta:
                    if os.getenv("KONTRA_VERBOSE"):
                        print(f"[INFO] Parquet metadata: {meta.num_rows} rows from footer")
                    return meta.num_rows
            except Exception:
                pass

        # Fall back to query
        result = self.con.execute(f"SELECT COUNT(*) FROM {self._view_name}").fetchone()
        return int(result[0]) if result else 0

    def get_estimated_size_bytes(self) -> Optional[int]:
        """Get estimated size from Parquet metadata."""
        if self.handle.format == "parquet" and _HAS_PYARROW:
            try:
                meta = self._get_parquet_metadata()
                if meta:
                    return meta.serialized_size
            except Exception:
                pass
        return None

    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
        """Execute aggregation query with multiple expressions."""
        if not exprs:
            return {}

        sql = f"SELECT {', '.join(exprs)} FROM {self._view_name}"
        cur = self.con.execute(sql)
        row = cur.fetchone()
        col_names = [d[0] for d in cur.description]
        return dict(zip(col_names, row)) if row else {}

    def fetch_top_values(self, column: str, limit: int) -> List[Tuple[Any, int]]:
        """Fetch top N most frequent values."""
        col = self.esc_ident(column)
        sql = f"""
            SELECT {col} AS val, COUNT(*) AS cnt
            FROM {self._view_name}
            WHERE {col} IS NOT NULL
            GROUP BY {col}
            ORDER BY cnt DESC
            LIMIT {limit}
        """
        try:
            rows = self.con.execute(sql).fetchall()
            return [(r[0], int(r[1])) for r in rows]
        except Exception:
            return []

    def fetch_distinct_values(self, column: str) -> List[Any]:
        """Fetch all distinct values for a column."""
        col = self.esc_ident(column)
        sql = f"""
            SELECT DISTINCT {col}
            FROM {self._view_name}
            WHERE {col} IS NOT NULL
            ORDER BY {col}
        """
        try:
            rows = self.con.execute(sql).fetchall()
            return [r[0] for r in rows]
        except Exception:
            return []

    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
        """Fetch sample values for pattern detection."""
        col = self.esc_ident(column)
        sql = f"""
            SELECT {col}
            FROM {self._view_name}
            WHERE {col} IS NOT NULL
            LIMIT {limit}
        """
        try:
            rows = self.con.execute(sql).fetchall()
            return [r[0] for r in rows if r[0] is not None]
        except Exception:
            return []

    def esc_ident(self, name: str) -> str:
        """Escape identifier for DuckDB."""
        return duckdb_esc_ident(name)

    @property
    def source_format(self) -> str:
        """Return source format."""
        return self.handle.format or "unknown"

    # ----------------------------- Internal methods -----------------------------

    def _create_source_view(self) -> None:
        """Create a DuckDB view over the source, optionally with sampling."""
        fmt = (self.handle.format or "").lower()
        uri = self.handle.uri

        if fmt == "parquet":
            read_fn = f"read_parquet({lit_str(uri)})"
        elif fmt == "csv":
            read_fn = f"read_csv_auto({lit_str(uri)})"
        else:
            # Try parquet first
            read_fn = f"read_parquet({lit_str(uri)})"

        if self.sample_size:
            sql = f"""
                CREATE OR REPLACE VIEW {self._view_name} AS
                SELECT * FROM {read_fn}
                USING SAMPLE {int(self.sample_size)} ROWS
            """
        else:
            sql = f"CREATE OR REPLACE VIEW {self._view_name} AS SELECT * FROM {read_fn}"

        self.con.execute(sql)

    def _get_parquet_metadata(self) -> Optional[Any]:
        """Extract Parquet metadata without reading data."""
        if not _HAS_PYARROW:
            return None

        if self._parquet_metadata is not None:
            return self._parquet_metadata

        try:
            uri = self.handle.uri
            fs = None

            # Handle S3
            if self.handle.scheme == "s3":
                opts = self.handle.fs_opts or {}
                kwargs: Dict[str, Any] = {}
                if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
                    kwargs["access_key"] = opts["s3_access_key_id"]
                    kwargs["secret_key"] = opts["s3_secret_access_key"]
                if opts.get("s3_endpoint"):
                    endpoint = opts["s3_endpoint"]
                    if endpoint.startswith("http://"):
                        endpoint = endpoint[7:]
                        kwargs["scheme"] = "http"
                    elif endpoint.startswith("https://"):
                        endpoint = endpoint[8:]
                        kwargs["scheme"] = "https"
                    kwargs["endpoint_override"] = endpoint
                if opts.get("s3_url_style", "").lower() == "path" or opts.get("s3_endpoint"):
                    kwargs["force_virtual_addressing"] = False

                fs = pafs.S3FileSystem(**kwargs)
                if uri.lower().startswith("s3://"):
                    uri = uri[5:]

            # Handle Azure (ADLS Gen2, Azure Blob)
            if self.handle.scheme in ("abfs", "abfss", "az"):
                opts = self.handle.fs_opts or {}
                kwargs: Dict[str, Any] = {}

                if opts.get("azure_account_name"):
                    kwargs["account_name"] = opts["azure_account_name"]
                if opts.get("azure_account_key"):
                    kwargs["account_key"] = opts["azure_account_key"]
                if opts.get("azure_sas_token"):
                    # PyArrow expects SAS token as 'sas_token' credential
                    sas = opts["azure_sas_token"]
                    if sas.startswith("?"):
                        sas = sas[1:]
                    kwargs["sas_token"] = sas

                try:
                    fs = pafs.AzureFileSystem(**kwargs)
                    # Strip scheme prefix for PyArrow
                    if uri.lower().startswith("abfss://"):
                        uri = uri[8:]
                    elif uri.lower().startswith("abfs://"):
                        uri = uri[7:]
                    elif uri.lower().startswith("az://"):
                        uri = uri[5:]
                except Exception:
                    # Azure filesystem not available or credentials invalid
                    # Fall back to DuckDB-based profiling
                    return None

            pf = pq.ParquetFile(uri, filesystem=fs)
            self._parquet_metadata = pf.metadata
            return self._parquet_metadata

        except Exception:
            return None

    def supports_metadata_only(self) -> bool:
        """
        Check if this backend supports metadata-only profiling.

        Returns True only for Parquet files when PyArrow is available.
        CSV files don't have metadata statistics.
        """
        return (
            self.handle.format == "parquet"
            and _HAS_PYARROW
            and self.sample_size is None
        )

    def profile_metadata_only(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> Dict[str, Dict[str, Any]]:
        """
        Profile columns using only Parquet metadata (no data scan).

        Returns dict mapping column_name -> {null_count, distinct_count, ...}

        Parquet row group statistics provide:
        - null_count: Exact count of nulls (sum across row groups)
        - num_values: Non-null values per row group
        - min/max: Column min/max (for potential use)

        Note: Parquet does NOT store distinct_count. We estimate from
        num_values (assuming all non-null values are distinct as upper bound).

        This is used for the 'lite' preset to achieve fast profiling
        without scanning the actual data.
        """
        meta = self._get_parquet_metadata()
        if not meta:
            raise RuntimeError("Cannot get Parquet metadata")

        # Build column stats by aggregating across row groups
        col_stats: Dict[str, Dict[str, Any]] = {}

        # Initialize stats for each column
        for col_name, _ in schema:
            col_stats[col_name] = {
                "null_count": 0,
                "num_values": 0,
                "has_statistics": False,
            }

        # Aggregate stats from all row groups
        for rg_idx in range(meta.num_row_groups):
            rg = meta.row_group(rg_idx)

            for col_idx in range(rg.num_columns):
                col_chunk = rg.column(col_idx)
                # Get column name from path (handles nested columns)
                col_path = col_chunk.path_in_schema
                col_name = col_path.split(".")[-1] if "." in col_path else col_path

                if col_name not in col_stats:
                    continue

                stats = col_chunk.statistics
                if stats is not None:
                    col_stats[col_name]["has_statistics"] = True
                    if stats.null_count is not None:
                        col_stats[col_name]["null_count"] += stats.null_count
                    if stats.num_values is not None:
                        col_stats[col_name]["num_values"] += stats.num_values

        # Build result dict
        result: Dict[str, Dict[str, Any]] = {}

        for col_name, raw_type in schema:
            stats = col_stats.get(col_name, {})

            null_count = stats.get("null_count", 0)
            num_values = stats.get("num_values", 0)
            has_stats = stats.get("has_statistics", False)

            # Estimate distinct_count:
            # - If no stats: use non-null count as upper bound
            # - Parquet doesn't track distinct count
            non_null = row_count - null_count if has_stats else row_count
            distinct_count = non_null  # Upper bound estimate

            result[col_name] = {
                "null_count": null_count if has_stats else 0,
                "distinct_count": distinct_count,
                "has_statistics": has_stats,
                "is_estimate": True,  # Flag that distinct_count is estimated
            }

        return result