kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/scout/profiler.py
ADDED
@@ -0,0 +1,801 @@
# src/kontra/scout/profiler.py
"""
ScoutProfiler - Contract-free data profiling with pluggable backends.

Supports:
- Parquet and CSV files (local + S3) via DuckDB backend
- PostgreSQL tables via PostgreSQL backend

Efficiency optimizations:
- Parquet metadata extraction (schema, row count) without data scan
- PostgreSQL pg_stats for lite preset
- Single-pass aggregation queries
- Smart sampling for expensive operations
- Preset modes for different profiling depths
"""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Literal, Optional, Tuple

from kontra.connectors.handle import DatasetHandle
from kontra.version import VERSION

from .types import (
    ColumnProfile,
    DatasetProfile,
    NumericStats,
    StringStats,
    TemporalStats,
    TopValue,
)
from .dtype_mapping import normalize_dtype


# Preset configurations
# New names (v0.7+): scout, scan, interrogate
# Old names (deprecated): lite, standard, deep
PRESETS = {
    # --- New preset names ---
    "scout": {
        # Quick recon: schema + row count + basic null/distinct only
        # Uses metadata-only path when available (pg_stats, Parquet footer)
        "include_numeric_stats": False,
        "include_string_stats": False,
        "include_temporal_stats": False,
        "include_top_values": False,
        "include_percentiles": False,
        "top_n": 0,
        "list_values_threshold": 5,
        "metadata_only": True,  # Use metadata-only path when backend supports it
    },
    "scan": {
        # Systematic pass: full stats, moderate top values
        # Uses strategic profiling when backend supports it (PostgreSQL)
        "include_numeric_stats": True,
        "include_string_stats": True,
        "include_temporal_stats": True,
        "include_top_values": True,
        "include_percentiles": False,
        "top_n": 5,
        "list_values_threshold": 10,
        "metadata_only": False,
        "strategic_standard": True,  # Use smart probing when available
    },
    "interrogate": {
        # Deep investigation: everything including percentiles
        "include_numeric_stats": True,
        "include_string_stats": True,
        "include_temporal_stats": True,
        "include_top_values": True,
        "include_percentiles": True,
        "top_n": 10,
        "list_values_threshold": 20,
        "metadata_only": False,
    },
    # --- Deprecated aliases (for backward compatibility) ---
    "lite": {
        # DEPRECATED: Use "scout" instead
        "include_numeric_stats": False,
        "include_string_stats": False,
        "include_temporal_stats": False,
        "include_top_values": False,
        "include_percentiles": False,
        "top_n": 0,
        "list_values_threshold": 5,
        "metadata_only": True,
    },
    "standard": {
        # DEPRECATED: Use "scan" instead
        "include_numeric_stats": True,
        "include_string_stats": True,
        "include_temporal_stats": True,
        "include_top_values": True,
        "include_percentiles": False,
        "top_n": 5,
        "list_values_threshold": 10,
        "metadata_only": False,
        "strategic_standard": True,
    },
    "deep": {
        # DEPRECATED: Use "interrogate" instead
        "include_numeric_stats": True,
        "include_string_stats": True,
        "include_temporal_stats": True,
        "include_top_values": True,
        "include_percentiles": True,
        "top_n": 10,
        "list_values_threshold": 20,
        "metadata_only": False,
    },
}

# Mapping from old preset names to new names (for deprecation warnings)
_DEPRECATED_PRESETS = {
    "lite": "scout",
    "standard": "scan",
    "deep": "interrogate",
    "llm": "scan",  # llm preset is removed, recommend scan + to_llm()
}


def _select_backend(handle: DatasetHandle, sample_size: Optional[int] = None):
    """
    Select the appropriate backend for the data source.

    Returns an instance of ProfilerBackend.
    """
    scheme = (handle.scheme or "").lower()

    if scheme in ("postgres", "postgresql"):
        from .backends.postgres_backend import PostgreSQLBackend
        return PostgreSQLBackend(handle, sample_size=sample_size)

    if scheme in ("mssql", "sqlserver"):
        from .backends.sqlserver_backend import SqlServerBackend
        return SqlServerBackend(handle, sample_size=sample_size)

    # Default to DuckDB for files (parquet, csv, etc.)
    from .backends.duckdb_backend import DuckDBBackend
    return DuckDBBackend(handle, sample_size=sample_size)


def _is_numeric(dtype: str) -> bool:
    return dtype in ("int", "float")


def _is_string(dtype: str) -> bool:
    return dtype == "string"


def _is_temporal(dtype: str) -> bool:
    return dtype in ("date", "datetime", "time")


class ScoutProfiler:
    """
    Contract-free data profiler with pluggable backends.

    Supports:
    - Parquet and CSV files (local + S3) via DuckDB backend
    - PostgreSQL tables via PostgreSQL backend

    Efficiency features:
    - Parquet metadata extraction (row count, schema) without data scan
    - PostgreSQL pg_stats for lite preset
    - Single-pass aggregation queries
    - Preset modes (lite/standard/deep) for different use cases
    - Smart sampling for large datasets

    Usage:
        # Quick overview
        profiler = ScoutProfiler("data.parquet", preset="lite")

        # Full analysis
        profiler = ScoutProfiler("data.parquet", preset="deep", include_patterns=True)

        # PostgreSQL table
        profiler = ScoutProfiler("postgres://user:pass@host/db/public.users")

        profile = profiler.profile()
        print(profile.to_dict())
    """

    def __init__(
        self,
        source_uri: str,
        *,
        preset: Literal["lite", "standard", "deep"] = "standard",
        list_values_threshold: Optional[int] = None,
        top_n: Optional[int] = None,
        sample_size: Optional[int] = None,
        include_patterns: bool = False,
        percentiles: Optional[List[int]] = None,
        columns: Optional[List[str]] = None,
        storage_options: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the profiler.

        Args:
            source_uri: Path or URI to the dataset (local, s3://, postgres://)
            preset: Profiling depth preset ("lite", "standard", "deep")
            list_values_threshold: List all values if distinct count <= this (overrides preset)
            top_n: Number of top frequent values to include (overrides preset)
            sample_size: If set, sample this many rows for profiling
            include_patterns: Whether to detect patterns (email, uuid, etc.)
            percentiles: List of percentiles to compute (overrides preset)
            columns: Specific columns to profile (default: all)
            storage_options: Cloud storage credentials (S3, Azure, GCS).
                For S3/MinIO: aws_access_key_id, aws_secret_access_key, aws_region, endpoint_url
                For Azure: account_name, account_key, sas_token, etc.
                These override environment variables when provided.
        """
        self.source_uri = source_uri
        self.handle = DatasetHandle.from_uri(source_uri, storage_options=storage_options)
        self.sample_size = sample_size
        self.include_patterns = include_patterns
        self.columns_filter = columns

        # Apply preset, then override with explicit args
        if preset not in PRESETS:
            valid_presets = ["scout", "scan", "interrogate"]
            raise ValueError(
                f"Invalid preset '{preset}'. Valid presets: {', '.join(valid_presets)}"
            )
        preset_config = PRESETS[preset]
        self.list_values_threshold = (
            list_values_threshold
            if list_values_threshold is not None
            else preset_config["list_values_threshold"]
        )
        self.top_n = top_n if top_n is not None else preset_config["top_n"]
        self.include_numeric_stats = preset_config["include_numeric_stats"]
        self.include_string_stats = preset_config["include_string_stats"]
        self.include_temporal_stats = preset_config["include_temporal_stats"]
        self.include_top_values = preset_config["include_top_values"]
        self.include_percentiles = preset_config["include_percentiles"]

        # Percentiles (only used if include_percentiles is True)
        self.percentiles = percentiles or [25, 50, 75, 99]

        # Metadata-only mode (for lite preset)
        self.metadata_only = preset_config.get("metadata_only", False)

        # Strategic standard mode (for standard preset on PostgreSQL)
        self.strategic_standard = preset_config.get("strategic_standard", False)

        # Backend is created on profile() call
        self.backend = None

    def profile(self) -> DatasetProfile:
        """Execute profiling and return structured results."""
        t0 = time.perf_counter()

        # Create backend
        self.backend = _select_backend(self.handle, sample_size=self.sample_size)

        try:
            # Connect to data source
            self.backend.connect()

            # 1. Get schema (column names and types)
            schema = self.backend.get_schema()

            # Filter columns if specified
            if self.columns_filter:
                schema = [(n, t) for n, t in schema if n in self.columns_filter]

            # 2. Get row count (backend handles optimization)
            row_count = self.backend.get_row_count()

            # 3. Get estimated size (if available)
            estimated_size = self.backend.get_estimated_size_bytes()

            # 4. Profile each column (single-pass aggregation)
            column_profiles = self._profile_columns(schema, row_count)

            # 5. Optionally detect patterns (sampling-based, efficient)
            if self.include_patterns:
                self._detect_patterns(column_profiles)

            # 6. Infer semantic types
            self._infer_semantic_types(column_profiles)

            duration_ms = int((time.perf_counter() - t0) * 1000)

            return DatasetProfile(
                source_uri=self.source_uri,
                source_format=self.backend.source_format,
                profiled_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                engine_version=VERSION,
                row_count=row_count,
                column_count=len(column_profiles),
                estimated_size_bytes=estimated_size,
                sampled=self.sample_size is not None,
                sample_size=self.sample_size,
                columns=column_profiles,
                profile_duration_ms=duration_ms,
            )
        finally:
            if self.backend:
                self.backend.close()

    def _profile_columns(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> List[ColumnProfile]:
        """Build single compound query for all column statistics."""
        if not schema:
            return []

        # Check if we can use metadata-only path (faster, no table scan)
        use_metadata_only = (
            self.metadata_only
            and hasattr(self.backend, "supports_metadata_only")
            and self.backend.supports_metadata_only()
        )

        if use_metadata_only:
            return self._profile_columns_from_metadata(schema, row_count)

        # Check if we can use strategic standard path (PostgreSQL optimization)
        use_strategic_standard = (
            self.strategic_standard
            and hasattr(self.backend, "supports_strategic_standard")
            and self.backend.supports_strategic_standard()
        )

        if use_strategic_standard:
            return self._profile_columns_strategic(schema, row_count)

        # Build aggregation expressions for each column
        exprs: List[str] = []
        col_info: List[Tuple[str, str, str]] = []  # (name, raw_type, normalized_type)

        for col_name, raw_type in schema:
            dtype = normalize_dtype(raw_type)
            col_info.append((col_name, raw_type, dtype))
            col_exprs = self._build_column_agg_exprs(col_name, dtype)
            exprs.extend(col_exprs)

        # Execute single aggregate query via backend
        results = self.backend.execute_stats_query(exprs)

        # Build ColumnProfile objects
        profiles: List[ColumnProfile] = []
        for col_name, raw_type, dtype in col_info:
            profile = self._build_column_profile(
                col_name, raw_type, dtype, results, row_count
            )
            profiles.append(profile)

        # Fetch top values and low-cardinality values
        for profile in profiles:
            self._fetch_top_values(profile, row_count)
            if profile.distinct_count <= self.list_values_threshold:
                self._fetch_all_values(profile)

        return profiles

    def _profile_columns_from_metadata(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> List[ColumnProfile]:
        """
        Profile columns using metadata only (no table scan).

        Used for 'lite' preset when backend supports it (PostgreSQL pg_stats, Parquet footer).
        Returns estimates, not exact counts.
        """
        # Get metadata from backend
        metadata = self.backend.profile_metadata_only(schema, row_count)

        profiles: List[ColumnProfile] = []
        for col_name, raw_type in schema:
            dtype = normalize_dtype(raw_type)
            col_meta = metadata.get(col_name, {})

            null_count = col_meta.get("null_count", 0)
            distinct_count = col_meta.get("distinct_count", 0)

            non_null_count = row_count - null_count
            null_rate = null_count / row_count if row_count > 0 else 0.0
            uniqueness_ratio = (
                distinct_count / non_null_count if non_null_count > 0 else 0.0
            )

            profile = ColumnProfile(
                name=col_name,
                dtype=dtype,
                dtype_raw=raw_type,
                row_count=row_count,
                null_count=null_count,
                null_rate=null_rate,
                distinct_count=distinct_count,
                uniqueness_ratio=uniqueness_ratio,
                is_low_cardinality=distinct_count <= self.list_values_threshold,
            )

            # Use most_common_vals from pg_stats for low-cardinality columns
            mcv = col_meta.get("most_common_vals")
            if mcv and profile.is_low_cardinality:
                profile.values = mcv

            profiles.append(profile)

        return profiles

    def _profile_columns_strategic(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> List[ColumnProfile]:
        """
        Profile columns using strategic queries (PostgreSQL optimization).

        This method optimizes standard preset for PostgreSQL by:
        1. Using metadata (pg_stats) for null/distinct counts
        2. Classifying columns by cardinality to choose optimal strategy
        3. Using TABLESAMPLE SYSTEM (not BERNOULLI) for numeric stats
        4. Batching low-cardinality GROUP BY queries
        5. Trusting pg_stats MCVs for high-cardinality columns

        Much faster than full table scan approach.
        """
        import os

        # Step 1: Get freshness info
        freshness = self.backend.get_table_freshness()
        is_fresh = freshness.get("is_fresh", False)

        if os.getenv("KONTRA_VERBOSE"):
            stale_ratio = freshness.get("stale_ratio", 1.0)
            print(f"[INFO] PostgreSQL stats freshness: stale_ratio={stale_ratio:.2f}, is_fresh={is_fresh}")

        # Step 2: Get metadata (null/distinct) and classify columns
        metadata = self.backend.profile_metadata_only(schema, row_count)
        classification = self.backend.classify_columns(schema, row_count)

        # Step 3: Build profile objects with metadata
        profiles: List[ColumnProfile] = []
        numeric_cols = []
        low_cardinality_cols = []

        for col_name, raw_type in schema:
            dtype = normalize_dtype(raw_type)
            col_meta = metadata.get(col_name, {})
            col_class = classification.get(col_name, {})

            null_count = col_meta.get("null_count", 0)
            distinct_count = col_meta.get("distinct_count", 0)

            non_null_count = row_count - null_count
            null_rate = null_count / row_count if row_count > 0 else 0.0
            uniqueness_ratio = (
                distinct_count / non_null_count if non_null_count > 0 else 0.0
            )

            profile = ColumnProfile(
                name=col_name,
                dtype=dtype,
                dtype_raw=raw_type,
                row_count=row_count,
                null_count=null_count,
                null_rate=null_rate,
                distinct_count=distinct_count,
                uniqueness_ratio=uniqueness_ratio,
                is_low_cardinality=distinct_count <= self.list_values_threshold,
            )

            # Track columns needing additional queries
            if _is_numeric(dtype) and self.include_numeric_stats:
                numeric_cols.append((col_name, profile))

            if col_class.get("strategy") == "group_by":
                low_cardinality_cols.append(col_name)
            elif col_class.get("strategy") == "metadata_only":
                # Use MCVs from pg_stats for top_values
                mcv = col_meta.get("most_common_vals")
                if mcv and self.include_top_values:
                    profile.top_values = [
                        TopValue(value=v, count=0, pct=0.0)
                        for v in mcv[:self.top_n]
                    ]
                if profile.is_low_cardinality:
                    profile.values = mcv

            profiles.append(profile)

        # Step 4: Numeric stats via TABLESAMPLE SYSTEM (fast block sampling)
        if numeric_cols:
            numeric_exprs = []
            # SQL Server uses STDEV, PostgreSQL/DuckDB use STDDEV
            is_duckdb = self.backend.source_format in ("parquet", "csv", "duckdb")
            stddev_fn = "STDEV" if self.backend.source_format == "sqlserver" else "STDDEV"
            for col_name, _ in numeric_cols:
                c = self.backend.esc_ident(col_name)
                # DuckDB: Filter out infinity values to prevent overflow errors
                if is_duckdb:
                    finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
                    numeric_exprs.extend([
                        f"MIN({finite_col}) AS {self.backend.esc_ident(f'__min__{col_name}')}",
                        f"MAX({finite_col}) AS {self.backend.esc_ident(f'__max__{col_name}')}",
                        f"AVG({finite_col}) AS {self.backend.esc_ident(f'__mean__{col_name}')}",
                        f"{stddev_fn}({finite_col}) AS {self.backend.esc_ident(f'__std__{col_name}')}",
                    ])
                else:
                    numeric_exprs.extend([
                        f"MIN({c}) AS {self.backend.esc_ident(f'__min__{col_name}')}",
                        f"MAX({c}) AS {self.backend.esc_ident(f'__max__{col_name}')}",
                        f"AVG({c}) AS {self.backend.esc_ident(f'__mean__{col_name}')}",
                        f"{stddev_fn}({c}) AS {self.backend.esc_ident(f'__std__{col_name}')}",
                    ])

            # Use SYSTEM sampling (block-level) - much faster than BERNOULLI
            # If stats are fresh, use smaller sample; if stale, use larger sample
            sample_pct = 1.0 if is_fresh else 5.0
            numeric_results = self.backend.execute_sampled_stats_query(
                numeric_exprs, sample_pct=sample_pct
            )

            # Populate numeric stats
            for col_name, profile in numeric_cols:
                profile.numeric = NumericStats(
                    min=self._to_float(numeric_results.get(f"__min__{col_name}")),
                    max=self._to_float(numeric_results.get(f"__max__{col_name}")),
                    mean=self._to_float(numeric_results.get(f"__mean__{col_name}")),
                    std=self._to_float(numeric_results.get(f"__std__{col_name}")),
                    median=None,  # Skip median in strategic mode (expensive)
                    percentiles={},
                )

        # Step 5: Low-cardinality columns via batched GROUP BY
        if low_cardinality_cols and self.include_top_values:
            low_card_values = self.backend.fetch_low_cardinality_values_batched(
                low_cardinality_cols
            )

            # Populate values and top_values
            for profile in profiles:
                if profile.name in low_card_values:
                    values_with_counts = low_card_values[profile.name]
                    profile.values = [v for v, _ in values_with_counts]
                    profile.top_values = [
                        TopValue(
                            value=v,
                            count=c,
                            pct=(c / row_count * 100) if row_count > 0 else 0.0,
                        )
                        for v, c in values_with_counts[:self.top_n]
                    ]

        # Step 6: Medium cardinality - sample top values
        medium_card_cols = [
            p.name for p in profiles
            if classification.get(p.name, {}).get("strategy") == "sample"
            and p.top_values is None
        ]

        if medium_card_cols and self.include_top_values:
            for col_name in medium_card_cols:
                profile = next(p for p in profiles if p.name == col_name)
                try:
                    rows = self.backend.fetch_top_values(col_name, self.top_n)
                    profile.top_values = [
                        TopValue(
                            value=val,
                            count=int(cnt),
                            pct=(int(cnt) / row_count * 100) if row_count > 0 else 0.0,
                        )
                        for val, cnt in rows
                    ]
                except Exception:
                    pass

        return profiles

    def _build_column_agg_exprs(self, col: str, dtype: str) -> List[str]:
        """Generate SQL expressions for a single column's statistics."""
        esc = self.backend.esc_ident
        c = esc(col)
        source_fmt = getattr(self.backend, "source_format", "")
        is_sqlserver = source_fmt == "sqlserver"
        is_duckdb = source_fmt in ("parquet", "csv", "duckdb")

        # Core stats: always included (null count, distinct count)
        exprs = [
            f"COUNT(*) - COUNT({c}) AS {esc(f'__null__{col}')}",
            f"COUNT(DISTINCT {c}) AS {esc(f'__distinct__{col}')}",
        ]

        # Numeric stats: controlled by preset
        if _is_numeric(dtype) and self.include_numeric_stats:
            # SQL Server: Cast to FLOAT to prevent overflow on large tables
            avg_expr = f"AVG(CAST({c} AS FLOAT))" if is_sqlserver else f"AVG({c})"
            # DuckDB: Filter out infinity values to prevent overflow errors
            if is_duckdb:
                finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
                exprs.extend([
                    f"MIN({finite_col}) AS {esc(f'__min__{col}')}",
                    f"MAX({finite_col}) AS {esc(f'__max__{col}')}",
                    f"AVG({finite_col}) AS {esc(f'__mean__{col}')}",
                ])
            else:
                exprs.extend([
                    f"MIN({c}) AS {esc(f'__min__{col}')}",
                    f"MAX({c}) AS {esc(f'__max__{col}')}",
                    f"{avg_expr} AS {esc(f'__mean__{col}')}",
                ])
            # SQL Server requires different PERCENTILE_CONT syntax (window function)
            # Skip median/percentiles for SQL Server - use STDEV instead of STDDEV
            if is_sqlserver:
                exprs.append(f"STDEV({c}) AS {esc(f'__std__{col}')}")
            elif is_duckdb:
                finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
                exprs.extend([
                    f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {finite_col}) AS {esc(f'__median__{col}')}",
                    f"STDDEV({finite_col}) AS {esc(f'__std__{col}')}",
                ])
            else:
                exprs.extend([
                    f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {c}) AS {esc(f'__median__{col}')}",
                    f"STDDEV({c}) AS {esc(f'__std__{col}')}",
                ])
            # Additional percentiles: expensive, only in deep preset
            if self.include_percentiles:
                for p in self.percentiles:
                    if p != 50:  # 50th is already the median
                        exprs.append(
                            f"PERCENTILE_CONT({p / 100}) WITHIN GROUP (ORDER BY {c}) "
                            f"AS {esc(f'__p{p}__{col}')}"
                        )

        # String stats: controlled by preset
        if _is_string(dtype) and self.include_string_stats:
            # SQL Server uses LEN(), others use LENGTH()
            len_fn = "LEN" if is_sqlserver else "LENGTH"
            # SQL Server needs BIGINT cast to prevent overflow on large tables
            sum_cast = "CAST(1 AS BIGINT)" if is_sqlserver else "1"
            exprs.extend([
                f"MIN({len_fn}({c})) AS {esc(f'__minlen__{col}')}",
                f"MAX({len_fn}({c})) AS {esc(f'__maxlen__{col}')}",
                f"AVG(CAST({len_fn}({c}) AS FLOAT)) AS {esc(f'__avglen__{col}')}",
                f"SUM(CASE WHEN {c} = '' THEN {sum_cast} ELSE 0 END) AS {esc(f'__empty__{col}')}",
            ])

        # Temporal stats: controlled by preset
        if _is_temporal(dtype) and self.include_temporal_stats:
            exprs.extend([
                f"MIN({c}) AS {esc(f'__datemin__{col}')}",
                f"MAX({c}) AS {esc(f'__datemax__{col}')}",
            ])

        return exprs

    def _build_column_profile(
        self,
        col_name: str,
        raw_type: str,
        dtype: str,
        results: Dict[str, Any],
        row_count: int,
    ) -> ColumnProfile:
        """Build a ColumnProfile from aggregation results."""
        null_count = int(results.get(f"__null__{col_name}", 0) or 0)
        distinct_count = int(results.get(f"__distinct__{col_name}", 0) or 0)

        non_null_count = row_count - null_count
        null_rate = null_count / row_count if row_count > 0 else 0.0
        uniqueness_ratio = (
            distinct_count / non_null_count if non_null_count > 0 else 0.0
        )

        profile = ColumnProfile(
            name=col_name,
            dtype=dtype,
            dtype_raw=raw_type,
            row_count=row_count,
            null_count=null_count,
            null_rate=null_rate,
            distinct_count=distinct_count,
            uniqueness_ratio=uniqueness_ratio,
            is_low_cardinality=distinct_count <= self.list_values_threshold,
        )

        # Add type-specific stats (only if included by preset)
        if _is_numeric(dtype) and self.include_numeric_stats:
            percentiles = {}
            if self.include_percentiles:
                for p in self.percentiles:
                    val = results.get(f"__p{p}__{col_name}")
                    if val is not None:
                        percentiles[f"p{p}"] = float(val)

            profile.numeric = NumericStats(
                min=self._to_float(results.get(f"__min__{col_name}")),
                max=self._to_float(results.get(f"__max__{col_name}")),
                mean=self._to_float(results.get(f"__mean__{col_name}")),
                median=self._to_float(results.get(f"__median__{col_name}")),
                std=self._to_float(results.get(f"__std__{col_name}")),
                percentiles=percentiles,
            )

        if _is_string(dtype) and self.include_string_stats:
            profile.string = StringStats(
                min_length=self._to_int(results.get(f"__minlen__{col_name}")),
                max_length=self._to_int(results.get(f"__maxlen__{col_name}")),
                avg_length=self._to_float(results.get(f"__avglen__{col_name}")),
                empty_count=self._to_int(results.get(f"__empty__{col_name}")) or 0,
            )

        if _is_temporal(dtype) and self.include_temporal_stats:
            date_min = results.get(f"__datemin__{col_name}")
            date_max = results.get(f"__datemax__{col_name}")
            profile.temporal = TemporalStats(
                date_min=str(date_min) if date_min else None,
                date_max=str(date_max) if date_max else None,
            )

        return profile

    def _fetch_top_values(self, profile: ColumnProfile, row_count: int) -> None:
        """Fetch top N most frequent values for a column."""
        # Skip if top values not requested or top_n is 0
        if not self.include_top_values or self.top_n <= 0:
            return
        if row_count == 0:
            return

        try:
            rows = self.backend.fetch_top_values(profile.name, self.top_n)
            profile.top_values = [
                TopValue(
                    value=val,
                    count=int(cnt),
                    pct=(int(cnt) / row_count * 100) if row_count > 0 else 0.0,
                )
                for val, cnt in rows
            ]
        except Exception:
            # Some types may not be groupable
            pass

    def _fetch_all_values(self, profile: ColumnProfile) -> None:
        """Fetch all distinct values for low-cardinality columns."""
        try:
            profile.values = self.backend.fetch_distinct_values(profile.name)
        except Exception:
            # Some types may not be sortable
            pass

    def _detect_patterns(self, profiles: List[ColumnProfile]) -> None:
        """Detect common patterns in string columns."""
        from .patterns import detect_patterns

        for profile in profiles:
            if profile.dtype != "string" or profile.distinct_count == 0:
                continue

            try:
                sample = self.backend.fetch_sample_values(profile.name, 100)
                sample = [str(v) for v in sample if v is not None]
                if sample:
                    profile.detected_patterns = detect_patterns(sample)
            except Exception:
                pass

    def _infer_semantic_types(self, profiles: List[ColumnProfile]) -> None:
        """Infer semantic type for each column based on profile data."""
        for profile in profiles:
            # Primary key / identifier candidate
            if profile.uniqueness_ratio >= 0.99 and profile.null_rate == 0:
                profile.semantic_type = "identifier"
            # Category (low cardinality, non-numeric)
            elif profile.is_low_cardinality and profile.dtype == "string":
                profile.semantic_type = "category"
            # Measure (numeric, non-low-cardinality)
            elif profile.dtype in ("int", "float") and not profile.is_low_cardinality:
                profile.semantic_type = "measure"
            # Timestamp
            elif profile.dtype in ("date", "datetime"):
                profile.semantic_type = "timestamp"
            # Boolean as category
            elif profile.dtype == "bool":
                profile.semantic_type = "category"

    @staticmethod
    def _to_float(val: Any) -> Optional[float]:
        if val is None:
            return None
        try:
            return float(val)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _to_int(val: Any) -> Optional[int]:
        if val is None:
            return None
        try:
            return int(val)
        except (TypeError, ValueError):
            return None
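
For orientation, a minimal usage sketch based on the ScoutProfiler docstring in this module. The preset name, constructor parameters, and profile fields come from the code above; the file path "events.parquet" is hypothetical.

# Sketch only: assumes a local Parquet file named "events.parquet".
# Any local Parquet/CSV path, s3:// URI, or postgres:// table URI accepted
# by DatasetHandle.from_uri() should work the same way.
from kontra.scout.profiler import ScoutProfiler

# "scan" is the newer name for the deprecated "standard" preset (see PRESETS above).
profiler = ScoutProfiler("events.parquet", preset="scan", top_n=5)
profile = profiler.profile()

print(profile.row_count, profile.column_count, profile.profile_duration_ms)
for col in profile.columns:
    print(col.name, col.dtype, f"null_rate={col.null_rate:.2%}")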