kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,801 @@
1
+ # src/kontra/scout/profiler.py
2
+ """
3
+ ScoutProfiler - Contract-free data profiling with pluggable backends.
4
+
5
+ Supports:
6
+ - Parquet and CSV files (local + S3) via DuckDB backend
7
+ - PostgreSQL and SQL Server tables via database backends
8
+
9
+ Efficiency optimizations:
10
+ - Parquet metadata extraction (schema, row count) without data scan
11
+ - PostgreSQL pg_stats for lite preset
12
+ - Single-pass aggregation queries
13
+ - Smart sampling for expensive operations
14
+ - Preset modes for different profiling depths
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import time
20
+ from datetime import datetime, timezone
21
+ from typing import Any, Dict, List, Literal, Optional, Tuple
22
+
23
+ from kontra.connectors.handle import DatasetHandle
24
+ from kontra.version import VERSION
25
+
26
+ from .types import (
27
+ ColumnProfile,
28
+ DatasetProfile,
29
+ NumericStats,
30
+ StringStats,
31
+ TemporalStats,
32
+ TopValue,
33
+ )
34
+ from .dtype_mapping import normalize_dtype
35
+
36
+
37
+ # Preset configurations
38
+ # New names (v0.7+): scout, scan, interrogate
39
+ # Old names (deprecated): lite, standard, deep
40
+ PRESETS = {
41
+ # --- New preset names ---
42
+ "scout": {
43
+ # Quick recon: schema + row count + basic null/distinct only
44
+ # Uses metadata-only path when available (pg_stats, Parquet footer)
45
+ "include_numeric_stats": False,
46
+ "include_string_stats": False,
47
+ "include_temporal_stats": False,
48
+ "include_top_values": False,
49
+ "include_percentiles": False,
50
+ "top_n": 0,
51
+ "list_values_threshold": 5,
52
+ "metadata_only": True, # Use metadata-only path when backend supports it
53
+ },
54
+ "scan": {
55
+ # Systematic pass: full stats, moderate top values
56
+ # Uses strategic profiling when backend supports it (PostgreSQL)
57
+ "include_numeric_stats": True,
58
+ "include_string_stats": True,
59
+ "include_temporal_stats": True,
60
+ "include_top_values": True,
61
+ "include_percentiles": False,
62
+ "top_n": 5,
63
+ "list_values_threshold": 10,
64
+ "metadata_only": False,
65
+ "strategic_standard": True, # Use smart probing when available
66
+ },
67
+ "interrogate": {
68
+ # Deep investigation: everything including percentiles
69
+ "include_numeric_stats": True,
70
+ "include_string_stats": True,
71
+ "include_temporal_stats": True,
72
+ "include_top_values": True,
73
+ "include_percentiles": True,
74
+ "top_n": 10,
75
+ "list_values_threshold": 20,
76
+ "metadata_only": False,
77
+ },
78
+ # --- Deprecated aliases (for backward compatibility) ---
79
+ "lite": {
80
+ # DEPRECATED: Use "scout" instead
81
+ "include_numeric_stats": False,
82
+ "include_string_stats": False,
83
+ "include_temporal_stats": False,
84
+ "include_top_values": False,
85
+ "include_percentiles": False,
86
+ "top_n": 0,
87
+ "list_values_threshold": 5,
88
+ "metadata_only": True,
89
+ },
90
+ "standard": {
91
+ # DEPRECATED: Use "scan" instead
92
+ "include_numeric_stats": True,
93
+ "include_string_stats": True,
94
+ "include_temporal_stats": True,
95
+ "include_top_values": True,
96
+ "include_percentiles": False,
97
+ "top_n": 5,
98
+ "list_values_threshold": 10,
99
+ "metadata_only": False,
100
+ "strategic_standard": True,
101
+ },
102
+ "deep": {
103
+ # DEPRECATED: Use "interrogate" instead
104
+ "include_numeric_stats": True,
105
+ "include_string_stats": True,
106
+ "include_temporal_stats": True,
107
+ "include_top_values": True,
108
+ "include_percentiles": True,
109
+ "top_n": 10,
110
+ "list_values_threshold": 20,
111
+ "metadata_only": False,
112
+ },
113
+ }
114
+
115
+ # Mapping from old preset names to new names (for deprecation warnings)
116
+ _DEPRECATED_PRESETS = {
117
+ "lite": "scout",
118
+ "standard": "scan",
119
+ "deep": "interrogate",
120
+ "llm": "scan", # llm preset is removed, recommend scan + to_llm()
121
+ }
122
+
123
+
124
+ def _select_backend(handle: DatasetHandle, sample_size: Optional[int] = None):
125
+ """
126
+ Select the appropriate backend for the data source.
127
+
128
+ Returns an instance of ProfilerBackend.
129
+ """
130
+ scheme = (handle.scheme or "").lower()
131
+
132
+ if scheme in ("postgres", "postgresql"):
133
+ from .backends.postgres_backend import PostgreSQLBackend
134
+ return PostgreSQLBackend(handle, sample_size=sample_size)
135
+
136
+ if scheme in ("mssql", "sqlserver"):
137
+ from .backends.sqlserver_backend import SqlServerBackend
138
+ return SqlServerBackend(handle, sample_size=sample_size)
139
+
140
+ # Default to DuckDB for files (parquet, csv, etc.)
141
+ from .backends.duckdb_backend import DuckDBBackend
142
+ return DuckDBBackend(handle, sample_size=sample_size)
143
+
144
+
145
+ def _is_numeric(dtype: str) -> bool:
146
+ return dtype in ("int", "float")
147
+
148
+
149
+ def _is_string(dtype: str) -> bool:
150
+ return dtype == "string"
151
+
152
+
153
+ def _is_temporal(dtype: str) -> bool:
154
+ return dtype in ("date", "datetime", "time")
155
+
156
+
157
+ class ScoutProfiler:
158
+ """
159
+ Contract-free data profiler with pluggable backends.
160
+
161
+ Supports:
162
+ - Parquet and CSV files (local + S3) via DuckDB backend
163
+ - PostgreSQL and SQL Server tables via database backends
164
+
165
+ Efficiency features:
166
+ - Parquet metadata extraction (row count, schema) without data scan
167
+ - PostgreSQL pg_stats for lite preset
168
+ - Single-pass aggregation queries
169
+ - Preset modes (scout/scan/interrogate) for different use cases
170
+ - Smart sampling for large datasets
171
+
172
+ Usage:
173
+ # Quick overview
174
+ profiler = ScoutProfiler("data.parquet", preset="lite")
175
+
176
+ # Full analysis
177
+ profiler = ScoutProfiler("data.parquet", preset="deep", include_patterns=True)
178
+
179
+ # PostgreSQL table
180
+ profiler = ScoutProfiler("postgres://user:pass@host/db/public.users")
181
+
182
+ profile = profiler.profile()
183
+ print(profile.to_dict())
184
+ """
185
+
186
+ def __init__(
187
+ self,
188
+ source_uri: str,
189
+ *,
190
+ preset: Literal["lite", "standard", "deep"] = "standard",
191
+ list_values_threshold: Optional[int] = None,
192
+ top_n: Optional[int] = None,
193
+ sample_size: Optional[int] = None,
194
+ include_patterns: bool = False,
195
+ percentiles: Optional[List[int]] = None,
196
+ columns: Optional[List[str]] = None,
197
+ storage_options: Optional[Dict[str, Any]] = None,
198
+ ):
199
+ """
200
+ Initialize the profiler.
201
+
202
+ Args:
203
+ source_uri: Path or URI to the dataset (local, s3://, postgres://)
204
+ preset: Profiling depth preset ("lite", "standard", "deep")
205
+ list_values_threshold: List all values if distinct count <= this (overrides preset)
206
+ top_n: Number of top frequent values to include (overrides preset)
207
+ sample_size: If set, sample this many rows for profiling
208
+ include_patterns: Whether to detect patterns (email, uuid, etc.)
209
+ percentiles: List of percentiles to compute (overrides preset)
210
+ columns: Specific columns to profile (default: all)
211
+ storage_options: Cloud storage credentials (S3, Azure, GCS).
212
+ For S3/MinIO: aws_access_key_id, aws_secret_access_key, aws_region, endpoint_url
213
+ For Azure: account_name, account_key, sas_token, etc.
214
+ These override environment variables when provided.
215
+ """
216
+ self.source_uri = source_uri
217
+ self.handle = DatasetHandle.from_uri(source_uri, storage_options=storage_options)
218
+ self.sample_size = sample_size
219
+ self.include_patterns = include_patterns
220
+ self.columns_filter = columns
221
+
222
+ # Apply preset, then override with explicit args
223
+ if preset not in PRESETS:
224
+ valid_presets = ["scout", "scan", "interrogate"]
225
+ raise ValueError(
226
+ f"Invalid preset '{preset}'. Valid presets: {', '.join(valid_presets)}"
227
+ )
228
+ preset_config = PRESETS[preset]
229
+ self.list_values_threshold = (
230
+ list_values_threshold
231
+ if list_values_threshold is not None
232
+ else preset_config["list_values_threshold"]
233
+ )
234
+ self.top_n = top_n if top_n is not None else preset_config["top_n"]
235
+ self.include_numeric_stats = preset_config["include_numeric_stats"]
236
+ self.include_string_stats = preset_config["include_string_stats"]
237
+ self.include_temporal_stats = preset_config["include_temporal_stats"]
238
+ self.include_top_values = preset_config["include_top_values"]
239
+ self.include_percentiles = preset_config["include_percentiles"]
240
+
241
+ # Percentiles (only used if include_percentiles is True)
242
+ self.percentiles = percentiles or [25, 50, 75, 99]
243
+
244
+ # Metadata-only mode (for lite preset)
245
+ self.metadata_only = preset_config.get("metadata_only", False)
246
+
247
+ # Strategic standard mode (for standard preset on PostgreSQL)
248
+ self.strategic_standard = preset_config.get("strategic_standard", False)
249
+
250
+ # Backend is created on profile() call
251
+ self.backend = None
252
+
253
+ def profile(self) -> DatasetProfile:
254
+ """Execute profiling and return structured results."""
255
+ t0 = time.perf_counter()
256
+
257
+ # Create backend
258
+ self.backend = _select_backend(self.handle, sample_size=self.sample_size)
259
+
260
+ try:
261
+ # Connect to data source
262
+ self.backend.connect()
263
+
264
+ # 1. Get schema (column names and types)
265
+ schema = self.backend.get_schema()
266
+
267
+ # Filter columns if specified
268
+ if self.columns_filter:
269
+ schema = [(n, t) for n, t in schema if n in self.columns_filter]
270
+
271
+ # 2. Get row count (backend handles optimization)
272
+ row_count = self.backend.get_row_count()
273
+
274
+ # 3. Get estimated size (if available)
275
+ estimated_size = self.backend.get_estimated_size_bytes()
276
+
277
+ # 4. Profile each column (single-pass aggregation)
278
+ column_profiles = self._profile_columns(schema, row_count)
279
+
280
+ # 5. Optionally detect patterns (sampling-based, efficient)
281
+ if self.include_patterns:
282
+ self._detect_patterns(column_profiles)
283
+
284
+ # 6. Infer semantic types
285
+ self._infer_semantic_types(column_profiles)
286
+
287
+ duration_ms = int((time.perf_counter() - t0) * 1000)
288
+
289
+ return DatasetProfile(
290
+ source_uri=self.source_uri,
291
+ source_format=self.backend.source_format,
292
+ profiled_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
293
+ engine_version=VERSION,
294
+ row_count=row_count,
295
+ column_count=len(column_profiles),
296
+ estimated_size_bytes=estimated_size,
297
+ sampled=self.sample_size is not None,
298
+ sample_size=self.sample_size,
299
+ columns=column_profiles,
300
+ profile_duration_ms=duration_ms,
301
+ )
302
+ finally:
303
+ if self.backend:
304
+ self.backend.close()
305
+
306
+ def _profile_columns(
307
+ self, schema: List[Tuple[str, str]], row_count: int
308
+ ) -> List[ColumnProfile]:
309
+ """Build single compound query for all column statistics."""
310
+ if not schema:
311
+ return []
312
+
313
+ # Check if we can use metadata-only path (faster, no table scan)
314
+ use_metadata_only = (
315
+ self.metadata_only
316
+ and hasattr(self.backend, "supports_metadata_only")
317
+ and self.backend.supports_metadata_only()
318
+ )
319
+
320
+ if use_metadata_only:
321
+ return self._profile_columns_from_metadata(schema, row_count)
322
+
323
+ # Check if we can use strategic standard path (PostgreSQL optimization)
324
+ use_strategic_standard = (
325
+ self.strategic_standard
326
+ and hasattr(self.backend, "supports_strategic_standard")
327
+ and self.backend.supports_strategic_standard()
328
+ )
329
+
330
+ if use_strategic_standard:
331
+ return self._profile_columns_strategic(schema, row_count)
332
+
333
+ # Build aggregation expressions for each column
334
+ exprs: List[str] = []
335
+ col_info: List[Tuple[str, str, str]] = [] # (name, raw_type, normalized_type)
336
+
337
+ for col_name, raw_type in schema:
338
+ dtype = normalize_dtype(raw_type)
339
+ col_info.append((col_name, raw_type, dtype))
340
+ col_exprs = self._build_column_agg_exprs(col_name, dtype)
341
+ exprs.extend(col_exprs)
342
+
343
+ # Execute single aggregate query via backend
344
+ results = self.backend.execute_stats_query(exprs)
345
+
346
+ # Build ColumnProfile objects
347
+ profiles: List[ColumnProfile] = []
348
+ for col_name, raw_type, dtype in col_info:
349
+ profile = self._build_column_profile(
350
+ col_name, raw_type, dtype, results, row_count
351
+ )
352
+ profiles.append(profile)
353
+
354
+ # Fetch top values and low-cardinality values
355
+ for profile in profiles:
356
+ self._fetch_top_values(profile, row_count)
357
+ if profile.distinct_count <= self.list_values_threshold:
358
+ self._fetch_all_values(profile)
359
+
360
+ return profiles
361
+
362
+ def _profile_columns_from_metadata(
363
+ self, schema: List[Tuple[str, str]], row_count: int
364
+ ) -> List[ColumnProfile]:
365
+ """
366
+ Profile columns using metadata only (no table scan).
367
+
368
+ Used for the 'scout' (formerly 'lite') preset when the backend supports it (PostgreSQL pg_stats, Parquet footer).
369
+ Returns estimates, not exact counts.
370
+ """
371
+ # Get metadata from backend
372
+ metadata = self.backend.profile_metadata_only(schema, row_count)
373
+
374
+ profiles: List[ColumnProfile] = []
375
+ for col_name, raw_type in schema:
376
+ dtype = normalize_dtype(raw_type)
377
+ col_meta = metadata.get(col_name, {})
378
+
379
+ null_count = col_meta.get("null_count", 0)
380
+ distinct_count = col_meta.get("distinct_count", 0)
381
+
382
+ non_null_count = row_count - null_count
383
+ null_rate = null_count / row_count if row_count > 0 else 0.0
384
+ uniqueness_ratio = (
385
+ distinct_count / non_null_count if non_null_count > 0 else 0.0
386
+ )
387
+
388
+ profile = ColumnProfile(
389
+ name=col_name,
390
+ dtype=dtype,
391
+ dtype_raw=raw_type,
392
+ row_count=row_count,
393
+ null_count=null_count,
394
+ null_rate=null_rate,
395
+ distinct_count=distinct_count,
396
+ uniqueness_ratio=uniqueness_ratio,
397
+ is_low_cardinality=distinct_count <= self.list_values_threshold,
398
+ )
399
+
400
+ # Use most_common_vals from pg_stats for low-cardinality columns
401
+ mcv = col_meta.get("most_common_vals")
402
+ if mcv and profile.is_low_cardinality:
403
+ profile.values = mcv
404
+
405
+ profiles.append(profile)
406
+
407
+ return profiles
408
+
409
+ def _profile_columns_strategic(
410
+ self, schema: List[Tuple[str, str]], row_count: int
411
+ ) -> List[ColumnProfile]:
412
+ """
413
+ Profile columns using strategic queries (PostgreSQL optimization).
414
+
415
+ This method optimizes the 'scan' (formerly 'standard') preset for PostgreSQL by:
416
+ 1. Using metadata (pg_stats) for null/distinct counts
417
+ 2. Classifying columns by cardinality to choose optimal strategy
418
+ 3. Using TABLESAMPLE SYSTEM (not BERNOULLI) for numeric stats
419
+ 4. Batching low-cardinality GROUP BY queries
420
+ 5. Trusting pg_stats MCVs for high-cardinality columns
421
+
422
+ Much faster than full table scan approach.
423
+ """
424
+ import os
425
+
426
+ # Step 1: Get freshness info
427
+ freshness = self.backend.get_table_freshness()
428
+ is_fresh = freshness.get("is_fresh", False)
429
+
430
+ if os.getenv("KONTRA_VERBOSE"):
431
+ stale_ratio = freshness.get("stale_ratio", 1.0)
432
+ print(f"[INFO] PostgreSQL stats freshness: stale_ratio={stale_ratio:.2f}, is_fresh={is_fresh}")
433
+
434
+ # Step 2: Get metadata (null/distinct) and classify columns
435
+ metadata = self.backend.profile_metadata_only(schema, row_count)
436
+ classification = self.backend.classify_columns(schema, row_count)
437
+
438
+ # Step 3: Build profile objects with metadata
439
+ profiles: List[ColumnProfile] = []
440
+ numeric_cols = []
441
+ low_cardinality_cols = []
442
+
443
+ for col_name, raw_type in schema:
444
+ dtype = normalize_dtype(raw_type)
445
+ col_meta = metadata.get(col_name, {})
446
+ col_class = classification.get(col_name, {})
447
+
448
+ null_count = col_meta.get("null_count", 0)
449
+ distinct_count = col_meta.get("distinct_count", 0)
450
+
451
+ non_null_count = row_count - null_count
452
+ null_rate = null_count / row_count if row_count > 0 else 0.0
453
+ uniqueness_ratio = (
454
+ distinct_count / non_null_count if non_null_count > 0 else 0.0
455
+ )
456
+
457
+ profile = ColumnProfile(
458
+ name=col_name,
459
+ dtype=dtype,
460
+ dtype_raw=raw_type,
461
+ row_count=row_count,
462
+ null_count=null_count,
463
+ null_rate=null_rate,
464
+ distinct_count=distinct_count,
465
+ uniqueness_ratio=uniqueness_ratio,
466
+ is_low_cardinality=distinct_count <= self.list_values_threshold,
467
+ )
468
+
469
+ # Track columns needing additional queries
470
+ if _is_numeric(dtype) and self.include_numeric_stats:
471
+ numeric_cols.append((col_name, profile))
472
+
473
+ if col_class.get("strategy") == "group_by":
474
+ low_cardinality_cols.append(col_name)
475
+ elif col_class.get("strategy") == "metadata_only":
476
+ # Use MCVs from pg_stats for top_values
477
+ mcv = col_meta.get("most_common_vals")
478
+ if mcv and self.include_top_values:
479
+ profile.top_values = [
480
+ TopValue(value=v, count=0, pct=0.0)
481
+ for v in mcv[:self.top_n]
482
+ ]
483
+ if profile.is_low_cardinality:
484
+ profile.values = mcv
485
+
486
+ profiles.append(profile)
487
+
488
+ # Step 4: Numeric stats via TABLESAMPLE SYSTEM (fast block sampling)
489
+ if numeric_cols:
490
+ numeric_exprs = []
491
+ # SQL Server uses STDEV, PostgreSQL/DuckDB use STDDEV
492
+ is_duckdb = self.backend.source_format in ("parquet", "csv", "duckdb")
493
+ stddev_fn = "STDEV" if self.backend.source_format == "sqlserver" else "STDDEV"
494
+ for col_name, _ in numeric_cols:
495
+ c = self.backend.esc_ident(col_name)
496
+ # DuckDB: Filter out infinity values to prevent overflow errors
497
+ if is_duckdb:
498
+ finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
499
+ numeric_exprs.extend([
500
+ f"MIN({finite_col}) AS {self.backend.esc_ident(f'__min__{col_name}')}",
501
+ f"MAX({finite_col}) AS {self.backend.esc_ident(f'__max__{col_name}')}",
502
+ f"AVG({finite_col}) AS {self.backend.esc_ident(f'__mean__{col_name}')}",
503
+ f"{stddev_fn}({finite_col}) AS {self.backend.esc_ident(f'__std__{col_name}')}",
504
+ ])
505
+ else:
506
+ numeric_exprs.extend([
507
+ f"MIN({c}) AS {self.backend.esc_ident(f'__min__{col_name}')}",
508
+ f"MAX({c}) AS {self.backend.esc_ident(f'__max__{col_name}')}",
509
+ f"AVG({c}) AS {self.backend.esc_ident(f'__mean__{col_name}')}",
510
+ f"{stddev_fn}({c}) AS {self.backend.esc_ident(f'__std__{col_name}')}",
511
+ ])
512
+
513
+ # Use SYSTEM sampling (block-level) - much faster than BERNOULLI
514
+ # If stats are fresh, use smaller sample; if stale, use larger sample
515
+ sample_pct = 1.0 if is_fresh else 5.0
516
+ numeric_results = self.backend.execute_sampled_stats_query(
517
+ numeric_exprs, sample_pct=sample_pct
518
+ )
519
+
520
+ # Populate numeric stats
521
+ for col_name, profile in numeric_cols:
522
+ profile.numeric = NumericStats(
523
+ min=self._to_float(numeric_results.get(f"__min__{col_name}")),
524
+ max=self._to_float(numeric_results.get(f"__max__{col_name}")),
525
+ mean=self._to_float(numeric_results.get(f"__mean__{col_name}")),
526
+ std=self._to_float(numeric_results.get(f"__std__{col_name}")),
527
+ median=None, # Skip median in strategic mode (expensive)
528
+ percentiles={},
529
+ )
530
+
531
+ # Step 5: Low-cardinality columns via batched GROUP BY
532
+ if low_cardinality_cols and self.include_top_values:
533
+ low_card_values = self.backend.fetch_low_cardinality_values_batched(
534
+ low_cardinality_cols
535
+ )
536
+
537
+ # Populate values and top_values
538
+ for profile in profiles:
539
+ if profile.name in low_card_values:
540
+ values_with_counts = low_card_values[profile.name]
541
+ profile.values = [v for v, _ in values_with_counts]
542
+ profile.top_values = [
543
+ TopValue(
544
+ value=v,
545
+ count=c,
546
+ pct=(c / row_count * 100) if row_count > 0 else 0.0,
547
+ )
548
+ for v, c in values_with_counts[:self.top_n]
549
+ ]
550
+
551
+ # Step 6: Medium cardinality - sample top values
552
+ medium_card_cols = [
553
+ p.name for p in profiles
554
+ if classification.get(p.name, {}).get("strategy") == "sample"
555
+ and p.top_values is None
556
+ ]
557
+
558
+ if medium_card_cols and self.include_top_values:
559
+ for col_name in medium_card_cols:
560
+ profile = next(p for p in profiles if p.name == col_name)
561
+ try:
562
+ rows = self.backend.fetch_top_values(col_name, self.top_n)
563
+ profile.top_values = [
564
+ TopValue(
565
+ value=val,
566
+ count=int(cnt),
567
+ pct=(int(cnt) / row_count * 100) if row_count > 0 else 0.0,
568
+ )
569
+ for val, cnt in rows
570
+ ]
571
+ except Exception:
572
+ pass
573
+
574
+ return profiles
575
+
576
+ def _build_column_agg_exprs(self, col: str, dtype: str) -> List[str]:
577
+ """Generate SQL expressions for a single column's statistics."""
578
+ esc = self.backend.esc_ident
579
+ c = esc(col)
580
+ source_fmt = getattr(self.backend, "source_format", "")
581
+ is_sqlserver = source_fmt == "sqlserver"
582
+ is_duckdb = source_fmt in ("parquet", "csv", "duckdb")
583
+
584
+ # Core stats: always included (null count, distinct count)
585
+ exprs = [
586
+ f"COUNT(*) - COUNT({c}) AS {esc(f'__null__{col}')}",
587
+ f"COUNT(DISTINCT {c}) AS {esc(f'__distinct__{col}')}",
588
+ ]
589
+
590
+ # Numeric stats: controlled by preset
591
+ if _is_numeric(dtype) and self.include_numeric_stats:
592
+ # SQL Server: Cast to FLOAT to prevent overflow on large tables
593
+ avg_expr = f"AVG(CAST({c} AS FLOAT))" if is_sqlserver else f"AVG({c})"
594
+ # DuckDB: Filter out infinity values to prevent overflow errors
595
+ if is_duckdb:
596
+ finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
597
+ exprs.extend([
598
+ f"MIN({finite_col}) AS {esc(f'__min__{col}')}",
599
+ f"MAX({finite_col}) AS {esc(f'__max__{col}')}",
600
+ f"AVG({finite_col}) AS {esc(f'__mean__{col}')}",
601
+ ])
602
+ else:
603
+ exprs.extend([
604
+ f"MIN({c}) AS {esc(f'__min__{col}')}",
605
+ f"MAX({c}) AS {esc(f'__max__{col}')}",
606
+ f"{avg_expr} AS {esc(f'__mean__{col}')}",
607
+ ])
608
+ # SQL Server requires different PERCENTILE_CONT syntax (window function)
609
+ # Skip median/percentiles for SQL Server - use STDEV instead of STDDEV
610
+ if is_sqlserver:
611
+ exprs.append(f"STDEV({c}) AS {esc(f'__std__{col}')}")
612
+ elif is_duckdb:
613
+ finite_col = f"CASE WHEN ISFINITE({c}) THEN {c} END"
614
+ exprs.extend([
615
+ f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {finite_col}) AS {esc(f'__median__{col}')}",
616
+ f"STDDEV({finite_col}) AS {esc(f'__std__{col}')}",
617
+ ])
618
+ else:
619
+ exprs.extend([
620
+ f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {c}) AS {esc(f'__median__{col}')}",
621
+ f"STDDEV({c}) AS {esc(f'__std__{col}')}",
622
+ ])
623
+ # Additional percentiles: expensive, only in deep preset
624
+ if self.include_percentiles:
625
+ for p in self.percentiles:
626
+ if p != 50: # 50th is already the median
627
+ exprs.append(
628
+ f"PERCENTILE_CONT({p / 100}) WITHIN GROUP (ORDER BY {c}) "
629
+ f"AS {esc(f'__p{p}__{col}')}"
630
+ )
631
+
632
+ # String stats: controlled by preset
633
+ if _is_string(dtype) and self.include_string_stats:
634
+ # SQL Server uses LEN(), others use LENGTH()
635
+ len_fn = "LEN" if is_sqlserver else "LENGTH"
636
+ # SQL Server needs BIGINT cast to prevent overflow on large tables
637
+ sum_cast = "CAST(1 AS BIGINT)" if is_sqlserver else "1"
638
+ exprs.extend([
639
+ f"MIN({len_fn}({c})) AS {esc(f'__minlen__{col}')}",
640
+ f"MAX({len_fn}({c})) AS {esc(f'__maxlen__{col}')}",
641
+ f"AVG(CAST({len_fn}({c}) AS FLOAT)) AS {esc(f'__avglen__{col}')}",
642
+ f"SUM(CASE WHEN {c} = '' THEN {sum_cast} ELSE 0 END) AS {esc(f'__empty__{col}')}",
643
+ ])
644
+
645
+ # Temporal stats: controlled by preset
646
+ if _is_temporal(dtype) and self.include_temporal_stats:
647
+ exprs.extend([
648
+ f"MIN({c}) AS {esc(f'__datemin__{col}')}",
649
+ f"MAX({c}) AS {esc(f'__datemax__{col}')}",
650
+ ])
651
+
652
+ return exprs
653
+
654
+ def _build_column_profile(
655
+ self,
656
+ col_name: str,
657
+ raw_type: str,
658
+ dtype: str,
659
+ results: Dict[str, Any],
660
+ row_count: int,
661
+ ) -> ColumnProfile:
662
+ """Build a ColumnProfile from aggregation results."""
663
+ null_count = int(results.get(f"__null__{col_name}", 0) or 0)
664
+ distinct_count = int(results.get(f"__distinct__{col_name}", 0) or 0)
665
+
666
+ non_null_count = row_count - null_count
667
+ null_rate = null_count / row_count if row_count > 0 else 0.0
668
+ uniqueness_ratio = (
669
+ distinct_count / non_null_count if non_null_count > 0 else 0.0
670
+ )
671
+
672
+ profile = ColumnProfile(
673
+ name=col_name,
674
+ dtype=dtype,
675
+ dtype_raw=raw_type,
676
+ row_count=row_count,
677
+ null_count=null_count,
678
+ null_rate=null_rate,
679
+ distinct_count=distinct_count,
680
+ uniqueness_ratio=uniqueness_ratio,
681
+ is_low_cardinality=distinct_count <= self.list_values_threshold,
682
+ )
683
+
684
+ # Add type-specific stats (only if included by preset)
685
+ if _is_numeric(dtype) and self.include_numeric_stats:
686
+ percentiles = {}
687
+ if self.include_percentiles:
688
+ for p in self.percentiles:
689
+ val = results.get(f"__p{p}__{col_name}")
690
+ if val is not None:
691
+ percentiles[f"p{p}"] = float(val)
692
+
693
+ profile.numeric = NumericStats(
694
+ min=self._to_float(results.get(f"__min__{col_name}")),
695
+ max=self._to_float(results.get(f"__max__{col_name}")),
696
+ mean=self._to_float(results.get(f"__mean__{col_name}")),
697
+ median=self._to_float(results.get(f"__median__{col_name}")),
698
+ std=self._to_float(results.get(f"__std__{col_name}")),
699
+ percentiles=percentiles,
700
+ )
701
+
702
+ if _is_string(dtype) and self.include_string_stats:
703
+ profile.string = StringStats(
704
+ min_length=self._to_int(results.get(f"__minlen__{col_name}")),
705
+ max_length=self._to_int(results.get(f"__maxlen__{col_name}")),
706
+ avg_length=self._to_float(results.get(f"__avglen__{col_name}")),
707
+ empty_count=self._to_int(results.get(f"__empty__{col_name}")) or 0,
708
+ )
709
+
710
+ if _is_temporal(dtype) and self.include_temporal_stats:
711
+ date_min = results.get(f"__datemin__{col_name}")
712
+ date_max = results.get(f"__datemax__{col_name}")
713
+ profile.temporal = TemporalStats(
714
+ date_min=str(date_min) if date_min else None,
715
+ date_max=str(date_max) if date_max else None,
716
+ )
717
+
718
+ return profile
719
+
720
+ def _fetch_top_values(self, profile: ColumnProfile, row_count: int) -> None:
721
+ """Fetch top N most frequent values for a column."""
722
+ # Skip if top values not requested or top_n is 0
723
+ if not self.include_top_values or self.top_n <= 0:
724
+ return
725
+ if row_count == 0:
726
+ return
727
+
728
+ try:
729
+ rows = self.backend.fetch_top_values(profile.name, self.top_n)
730
+ profile.top_values = [
731
+ TopValue(
732
+ value=val,
733
+ count=int(cnt),
734
+ pct=(int(cnt) / row_count * 100) if row_count > 0 else 0.0,
735
+ )
736
+ for val, cnt in rows
737
+ ]
738
+ except Exception:
739
+ # Some types may not be groupable
740
+ pass
741
+
742
+ def _fetch_all_values(self, profile: ColumnProfile) -> None:
743
+ """Fetch all distinct values for low-cardinality columns."""
744
+ try:
745
+ profile.values = self.backend.fetch_distinct_values(profile.name)
746
+ except Exception:
747
+ # Some types may not be sortable
748
+ pass
749
+
750
+ def _detect_patterns(self, profiles: List[ColumnProfile]) -> None:
751
+ """Detect common patterns in string columns."""
752
+ from .patterns import detect_patterns
753
+
754
+ for profile in profiles:
755
+ if profile.dtype != "string" or profile.distinct_count == 0:
756
+ continue
757
+
758
+ try:
759
+ sample = self.backend.fetch_sample_values(profile.name, 100)
760
+ sample = [str(v) for v in sample if v is not None]
761
+ if sample:
762
+ profile.detected_patterns = detect_patterns(sample)
763
+ except Exception:
764
+ pass
765
+
766
+ def _infer_semantic_types(self, profiles: List[ColumnProfile]) -> None:
767
+ """Infer semantic type for each column based on profile data."""
768
+ for profile in profiles:
769
+ # Primary key / identifier candidate
770
+ if profile.uniqueness_ratio >= 0.99 and profile.null_rate == 0:
771
+ profile.semantic_type = "identifier"
772
+ # Category (low cardinality, non-numeric)
773
+ elif profile.is_low_cardinality and profile.dtype == "string":
774
+ profile.semantic_type = "category"
775
+ # Measure (numeric, non-low-cardinality)
776
+ elif profile.dtype in ("int", "float") and not profile.is_low_cardinality:
777
+ profile.semantic_type = "measure"
778
+ # Timestamp
779
+ elif profile.dtype in ("date", "datetime"):
780
+ profile.semantic_type = "timestamp"
781
+ # Boolean as category
782
+ elif profile.dtype == "bool":
783
+ profile.semantic_type = "category"
784
+
785
+ @staticmethod
786
+ def _to_float(val: Any) -> Optional[float]:
787
+ if val is None:
788
+ return None
789
+ try:
790
+ return float(val)
791
+ except (TypeError, ValueError):
792
+ return None
793
+
794
+ @staticmethod
795
+ def _to_int(val: Any) -> Optional[int]:
796
+ if val is None:
797
+ return None
798
+ try:
799
+ return int(val)
800
+ except (TypeError, ValueError):
801
+ return None
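The ScoutProfiler surface added in this file is small: a constructor that takes a source URI plus keyword options, and a profile() method that returns a DatasetProfile. The following is a minimal usage sketch assembled from the docstrings above. It is illustrative only: the paths, bucket, and connection string are placeholders, and it uses the new preset names ("scout", "scan", "interrogate") that this module introduces alongside the deprecated "lite"/"standard"/"deep" aliases.

    from kontra.scout.profiler import ScoutProfiler

    # Quick recon: schema, row count, and null/distinct estimates only.
    # Uses the metadata-only path (Parquet footer, PostgreSQL pg_stats) when available.
    profile = ScoutProfiler("data.parquet", preset="scout").profile()
    print(profile.to_dict())

    # Deep investigation of an S3 object, with pattern detection and percentiles.
    # The storage_options keys follow the __init__ docstring (S3/MinIO); values are placeholders.
    profiler = ScoutProfiler(
        "s3://my-bucket/data.parquet",
        preset="interrogate",
        include_patterns=True,
        storage_options={
            "aws_access_key_id": "PLACEHOLDER",
            "aws_secret_access_key": "PLACEHOLDER",
            "aws_region": "us-east-1",
        },
    )
    s3_profile = profiler.profile()

    # PostgreSQL table; the "scan" preset takes the strategic pg_stats-backed path.
    pg_profile = ScoutProfiler(
        "postgres://user:pass@host/db/public.users",
        preset="scan",
    ).profile()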
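Continuing that sketch, the result objects are the DatasetProfile and ColumnProfile types built in profile() and _build_column_profile() above. The loop below assumes the optional blocks (numeric, top_values, semantic_type) default to None when a preset skips them, which is what the conditional assignments and None checks in this file imply.

    profile = ScoutProfiler("data.parquet", preset="scan").profile()

    print(profile.row_count, profile.column_count, profile.profile_duration_ms)
    for col in profile.columns:
        # Core stats are always populated; type-specific stats depend on the preset.
        print(col.name, col.dtype, col.null_rate, col.distinct_count, col.semantic_type)
        if col.numeric:
            print("  range:", col.numeric.min, "to", col.numeric.max, "mean:", col.numeric.mean)
        if col.top_values:
            for tv in col.top_values:
                print("   ", tv.value, tv.count, f"{tv.pct:.1f}%")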
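Finally, because the deprecated entries in PRESETS are defined with settings identical to their replacements, the alias relationship documented in _DEPRECATED_PRESETS can be checked directly:

    from kontra.scout.profiler import PRESETS

    assert PRESETS["lite"] == PRESETS["scout"]
    assert PRESETS["standard"] == PRESETS["scan"]
    assert PRESETS["deep"] == PRESETS["interrogate"]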