kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,519 @@
# src/kontra/scout/backends/postgres_backend.py
"""
PostgreSQL backend for Scout profiler.

Uses pg_stats for efficient metadata queries and standard SQL for profiling.
"""

from __future__ import annotations

import os
from typing import Any, Dict, List, Optional, Tuple

from kontra.connectors.handle import DatasetHandle
from kontra.connectors.postgres import PostgresConnectionParams, get_connection
from kontra.scout.dtype_mapping import normalize_dtype


class PostgreSQLBackend:
    """
    PostgreSQL-based profiler backend.

    Features:
    - Uses pg_class / pg_stats metadata for row-count and column estimates (lite preset)
    - SQL aggregation for profiling
    - Dialect-aware SQL (PERCENTILE_CONT instead of MEDIAN)
    """

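    # The dialect note above matters because DuckDB accepts MEDIAN(col), while
    # PostgreSQL spells the same aggregate as (illustrative only, not a query
    # built in this file):
    #   PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY col)
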
    def __init__(
        self,
        handle: DatasetHandle,
        *,
        sample_size: Optional[int] = None,
    ):
        if not handle.db_params:
            raise ValueError("PostgreSQL handle missing db_params")

        self.handle = handle
        self.params: PostgresConnectionParams = handle.db_params
        self.sample_size = sample_size
        self._conn = None
        self._pg_stats: Optional[Dict[str, Dict[str, Any]]] = None
        self._schema: Optional[List[Tuple[str, str]]] = None

    def connect(self) -> None:
        """Establish connection to PostgreSQL."""
        self._conn = get_connection(self.params)

    def close(self) -> None:
        """Close the connection."""
        if self._conn:
            self._conn.close()
            self._conn = None

    def get_schema(self) -> List[Tuple[str, str]]:
        """Return [(column_name, raw_type), ...]"""
        if self._schema is not None:
            return self._schema

        with self._conn.cursor() as cur:
            cur.execute(
                """
                SELECT column_name, data_type
                FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                ORDER BY ordinal_position
                """,
                (self.params.schema, self.params.table),
            )
            self._schema = [(row[0], row[1]) for row in cur.fetchall()]
        return self._schema

    def get_row_count(self) -> int:
        """
        Get row count.

        For large tables, uses pg_class estimate first (fast).
        Falls back to COUNT(*) for accuracy.
        """
        # Try pg_class estimate first (instant, no scan)
        with self._conn.cursor() as cur:
            cur.execute(
                """
                SELECT reltuples::bigint
                FROM pg_class
                WHERE relname = %s
                  AND relnamespace = %s::regnamespace
                """,
                (self.params.table, self.params.schema),
            )
            row = cur.fetchone()
            estimate = row[0] if row else 0

            # If estimate is 0 or negative (stats not updated), use COUNT
            if estimate <= 0:
                cur.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
                row = cur.fetchone()
                return int(row[0]) if row else 0

            # If sample_size is set, we need exact count for accuracy
            if self.sample_size:
                cur.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
                row = cur.fetchone()
                return int(row[0]) if row else 0

        # Use estimate for large tables
        if os.getenv("KONTRA_VERBOSE"):
            print(f"[INFO] pg_class estimate: {estimate} rows")
        return int(estimate)

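    # Worked example (hypothetical numbers): a recently ANALYZEd table of
    # ~2,000,000 rows has reltuples ≈ 2000000 in pg_class, so get_row_count()
    # returns the estimate without a scan. A never-analyzed table reports 0
    # (-1 on PostgreSQL 14+), hitting the estimate <= 0 branch and an exact
    # COUNT(*).
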
    def get_estimated_size_bytes(self) -> Optional[int]:
        """Estimate size from pg_class."""
        try:
            with self._conn.cursor() as cur:
                cur.execute(
                    "SELECT pg_total_relation_size(%s::regclass)",
                    (f"{self.params.schema}.{self.params.table}",),
                )
                row = cur.fetchone()
                return int(row[0]) if row else None
        except Exception:
            return None

    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
        """Execute aggregation query."""
        if not exprs:
            return {}

        # Build query with optional sampling
        table = self._qualified_table()
        if self.sample_size:
            # PostgreSQL sampling: TABLESAMPLE or random() limit
            sql = f"""
                SELECT {', '.join(exprs)}
                FROM {table}
                TABLESAMPLE BERNOULLI (
                    LEAST(100, {self.sample_size} * 100.0 / NULLIF(
                        (SELECT reltuples FROM pg_class
                          WHERE relname = '{self.params.table}'
                            AND relnamespace = '{self.params.schema}'::regnamespace), 0
                    ))
                )
            """
        else:
            sql = f"SELECT {', '.join(exprs)} FROM {table}"

        with self._conn.cursor() as cur:
            cur.execute(sql)
            row = cur.fetchone()
            col_names = [desc[0] for desc in cur.description]
            return dict(zip(col_names, row)) if row else {}

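    # Worked example (hypothetical numbers): with sample_size = 10_000 and a
    # reltuples estimate of 1_000_000, the TABLESAMPLE argument evaluates to
    # LEAST(100, 10000 * 100.0 / 1000000) = 1.0, i.e. a ~1% BERNOULLI row
    # sample; NULLIF keeps a zero estimate from dividing by zero.
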
    def fetch_top_values(self, column: str, limit: int) -> List[Tuple[Any, int]]:
        """Fetch top N most frequent values."""
        col = self.esc_ident(column)
        table = self._qualified_table()
        sql = f"""
            SELECT {col} AS val, COUNT(*) AS cnt
            FROM {table}
            WHERE {col} IS NOT NULL
            GROUP BY {col}
            ORDER BY cnt DESC
            LIMIT {limit}
        """
        try:
            with self._conn.cursor() as cur:
                cur.execute(sql)
                return [(r[0], int(r[1])) for r in cur.fetchall()]
        except Exception:
            return []

    def fetch_distinct_values(self, column: str) -> List[Any]:
        """Fetch all distinct values."""
        col = self.esc_ident(column)
        table = self._qualified_table()
        sql = f"""
            SELECT DISTINCT {col}
            FROM {table}
            WHERE {col} IS NOT NULL
            ORDER BY {col}
        """
        try:
            with self._conn.cursor() as cur:
                cur.execute(sql)
                return [r[0] for r in cur.fetchall()]
        except Exception:
            return []

    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
        """Fetch sample values."""
        col = self.esc_ident(column)
        table = self._qualified_table()
        sql = f"""
            SELECT {col}
            FROM {table}
            WHERE {col} IS NOT NULL
            LIMIT {limit}
        """
        try:
            with self._conn.cursor() as cur:
                cur.execute(sql)
                return [r[0] for r in cur.fetchall() if r[0] is not None]
        except Exception:
            return []

    def esc_ident(self, name: str) -> str:
        """Escape identifier for PostgreSQL."""
        return '"' + name.replace('"', '""') + '"'

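    # Example: esc_ident('user') returns '"user"', and esc_ident('a"b')
    # doubles the embedded quote to '"a""b"' per PostgreSQL quoting rules.
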
    @property
    def source_format(self) -> str:
        """Return source format."""
        return "postgres"

    # ----------------------------- Internal methods -----------------------------

    def _qualified_table(self) -> str:
        """Return schema.table identifier."""
        return f"{self.esc_ident(self.params.schema)}.{self.esc_ident(self.params.table)}"

    def _get_pg_stats(self) -> Dict[str, Dict[str, Any]]:
        """Fetch and cache pg_stats."""
        if self._pg_stats is not None:
            return self._pg_stats

        with self._conn.cursor() as cur:
            cur.execute(
                """
                SELECT attname, null_frac, n_distinct,
                       most_common_vals::text, most_common_freqs::text
                FROM pg_stats
                WHERE schemaname = %s AND tablename = %s
                """,
                (self.params.schema, self.params.table),
            )
            self._pg_stats = {}
            for row in cur.fetchall():
                self._pg_stats[row[0]] = {
                    "null_frac": row[1],
                    "n_distinct": row[2],
                    "most_common_vals": row[3],
                    "most_common_freqs": row[4],
                }
        return self._pg_stats

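    # Shape of a cached entry (hypothetical values): a status column might
    # appear as {"null_frac": 0.01, "n_distinct": 3,
    # "most_common_vals": '{active,inactive,banned}',
    # "most_common_freqs": '{0.70,0.25,0.04}'}. The ::text casts flatten
    # Postgres anyarray columns into the brace-delimited strings parsed by
    # profile_metadata_only() below.
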
    def supports_metadata_only(self) -> bool:
        """Check if this backend supports metadata-only profiling."""
        return True

    def profile_metadata_only(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> Dict[str, Dict[str, Any]]:
        """
        Profile columns using only pg_stats metadata (no table scan).

        Returns dict mapping column_name -> {null_count, distinct_count, ...}

        This is used for the 'lite' preset to achieve near-instant profiling.
        Note: values are estimates based on PostgreSQL statistics, not exact counts.
        """
        pg_stats = self._get_pg_stats()
        result = {}

        for col_name, raw_type in schema:
            col_stats = pg_stats.get(col_name, {})

            # null_frac is fraction of nulls (0.0 to 1.0)
            null_frac = col_stats.get("null_frac", 0.0) or 0.0
            null_count = int(row_count * null_frac)

            # n_distinct interpretation:
            # - Positive: exact count of distinct values
            # - Negative: fraction of rows that are distinct (multiply by row_count)
            # - 0 or missing: unknown
            n_distinct = col_stats.get("n_distinct", 0) or 0
            if n_distinct > 0:
                distinct_count = int(n_distinct)
            elif n_distinct < 0:
                # Negative means fraction: -0.5 means 50% of rows are distinct
                distinct_count = int(abs(n_distinct) * row_count)
            else:
                # Unknown - estimate from null_frac (non-null rows)
                distinct_count = int(row_count * (1 - null_frac))

            # Parse most_common_vals if available (for low cardinality detection)
            mcv_raw = col_stats.get("most_common_vals")
            most_common_vals = None
            if mcv_raw:
                # pg_stats returns array as text: {val1,val2,...}
                try:
                    # Remove braces and split
                    if mcv_raw.startswith("{") and mcv_raw.endswith("}"):
                        vals = mcv_raw[1:-1].split(",")
                        most_common_vals = [v.strip().strip('"') for v in vals if v.strip()]
                except Exception:
                    pass

            result[col_name] = {
                "null_count": null_count,
                "distinct_count": distinct_count,
                "null_frac": null_frac,
                "n_distinct_raw": n_distinct,
                "most_common_vals": most_common_vals,
                "is_estimate": True,  # Flag that these are estimates
            }

        return result

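    # Worked example (hypothetical numbers): for row_count = 1_000_000 with
    # null_frac = 0.02 and n_distinct = -0.5, this yields null_count = 20_000
    # and distinct_count = 500_000; an MCV text of '{US,DE,FR}' parses to
    # ["US", "DE", "FR"].
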
    def get_table_freshness(self) -> Dict[str, Any]:
        """
        Get table statistics freshness from pg_stat_user_tables.

        Returns dict with:
        - n_live_tup: estimated live rows
        - n_mod_since_analyze: rows modified since last ANALYZE
        - last_analyze: timestamp of last manual ANALYZE
        - last_autoanalyze: timestamp of last auto ANALYZE
        - stale_ratio: n_mod_since_analyze / n_live_tup (0.0 = fresh, 1.0 = very stale)
        - is_fresh: True if stale_ratio < 0.2
        """
        with self._conn.cursor() as cur:
            cur.execute(
                """
                SELECT
                    n_live_tup,
                    n_mod_since_analyze,
                    last_analyze,
                    last_autoanalyze
                FROM pg_stat_user_tables
                WHERE schemaname = %s AND relname = %s
                """,
                (self.params.schema, self.params.table),
            )
            row = cur.fetchone()

        if not row:
            return {
                "n_live_tup": 0,
                "n_mod_since_analyze": 0,
                "last_analyze": None,
                "last_autoanalyze": None,
                "stale_ratio": 1.0,
                "is_fresh": False,
            }

        n_live_tup = row[0] or 0
        n_mod_since_analyze = row[1] or 0
        last_analyze = row[2]
        last_autoanalyze = row[3]

        # Calculate staleness ratio
        stale_ratio = (
            n_mod_since_analyze / max(n_live_tup, 1) if n_live_tup > 0 else 1.0
        )

        return {
            "n_live_tup": n_live_tup,
            "n_mod_since_analyze": n_mod_since_analyze,
            "last_analyze": last_analyze,
            "last_autoanalyze": last_autoanalyze,
            "stale_ratio": stale_ratio,
            "is_fresh": stale_ratio < 0.2,
        }

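    # Worked example (hypothetical numbers): n_live_tup = 500_000 with
    # n_mod_since_analyze = 25_000 gives stale_ratio = 0.05 (is_fresh = True);
    # at 150_000 modified rows the ratio is 0.30 and the pg_stats-derived
    # estimates above deserve less trust.
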
366
+ def supports_strategic_standard(self) -> bool:
367
+ """Check if this backend supports strategic standard profiling."""
368
+ return True
369
+
370
+ def execute_sampled_stats_query(
371
+ self, exprs: List[str], sample_pct: float = 1.0
372
+ ) -> Dict[str, Any]:
373
+ """
374
+ Execute aggregation query with TABLESAMPLE SYSTEM (block sampling).
375
+
376
+ Unlike BERNOULLI which scans the entire table, SYSTEM samples
377
+ at the block level - much faster for large tables.
378
+
379
+ Args:
380
+ exprs: List of SQL expressions to compute
381
+ sample_pct: Percentage of blocks to sample (default 1%)
382
+
383
+ Returns:
384
+ Dict of expression alias -> value
385
+ """
386
+ if not exprs:
387
+ return {}
388
+
389
+ table = self._qualified_table()
390
+ # SYSTEM samples blocks, not rows - much faster than BERNOULLI
391
+ sql = f"""
392
+ SELECT {', '.join(exprs)}
393
+ FROM {table}
394
+ TABLESAMPLE SYSTEM ({sample_pct})
395
+ """
396
+
397
+ try:
398
+ with self._conn.cursor() as cur:
399
+ cur.execute(sql)
400
+ row = cur.fetchone()
401
+ col_names = [desc[0] for desc in cur.description]
402
+ result = dict(zip(col_names, row)) if row else {}
403
+
404
+ # If TABLESAMPLE returned empty (all NULLs), fall back to full query
405
+ # This happens for small tables where 1% sampling returns 0 rows
406
+ if result and all(v is None for v in result.values()):
407
+ return self.execute_stats_query(exprs)
408
+
409
+ return result
410
+ except Exception:
411
+ # Fall back to full query if TABLESAMPLE fails
412
+ return self.execute_stats_query(exprs)
413
+
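    # Sketch of the generated SQL (hypothetical table and expressions): for
    # exprs = ['COUNT(*) AS n', 'AVG("amount") AS avg_amount'] at the default
    # sample_pct, the query reads roughly
    #   SELECT COUNT(*) AS n, AVG("amount") AS avg_amount
    #   FROM "public"."orders" TABLESAMPLE SYSTEM (1.0)
    # sampling ~1% of disk blocks rather than individual rows.
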
    def fetch_low_cardinality_values_batched(
        self, columns: List[str]
    ) -> Dict[str, List[Tuple[Any, int]]]:
        """
        Fetch value distributions for multiple low-cardinality columns in one query.

        Uses UNION ALL to batch multiple GROUP BY queries into a single round-trip.

        Args:
            columns: List of column names to profile

        Returns:
            Dict mapping column_name -> [(value, count), ...]
        """
        if not columns:
            return {}

        table = self._qualified_table()
        parts = []
        for col in columns:
            c = self.esc_ident(col)
            # Cast to text for uniformity, include column name for identification
            parts.append(f"""
                SELECT '{col}' AS col_name, {c}::text AS val, COUNT(*) AS cnt
                FROM {table}
                WHERE {c} IS NOT NULL
                GROUP BY {c}
            """)

        sql = " UNION ALL ".join(parts) + " ORDER BY col_name, cnt DESC"

        result: Dict[str, List[Tuple[Any, int]]] = {col: [] for col in columns}
        try:
            with self._conn.cursor() as cur:
                cur.execute(sql)
                for row in cur.fetchall():
                    col_name, val, cnt = row
                    if col_name in result:
                        result[col_name].append((val, int(cnt)))
        except Exception:
            pass

        return result

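    # Sketch of the batched SQL for columns = ["status", "tier"] (hypothetical
    # column names):
    #   SELECT 'status' AS col_name, "status"::text AS val, COUNT(*) AS cnt
    #     FROM ... WHERE "status" IS NOT NULL GROUP BY "status"
    #   UNION ALL
    #   SELECT 'tier' AS col_name, "tier"::text AS val, COUNT(*) AS cnt
    #     FROM ... WHERE "tier" IS NOT NULL GROUP BY "tier"
    #   ORDER BY col_name, cnt DESC
    # One round-trip replaces a separate GROUP BY query per column.
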
    def classify_columns(
        self, schema: List[Tuple[str, str]], row_count: int
    ) -> Dict[str, Dict[str, Any]]:
        """
        Classify columns based on pg_stats metadata for strategic profiling.

        Returns dict mapping column_name -> {
            "cardinality": "low" | "medium" | "high",
            "n_distinct": raw n_distinct value,
            "estimated_distinct": estimated distinct count,
            "strategy": "group_by" | "sample" | "metadata_only",
        }

        Classification rules:
        - low: n_distinct < 20 → fetch all via GROUP BY
        - medium: n_distinct 20-10000 → sample for top values
        - high: n_distinct > 10000 → trust metadata MCVs only
        """
        pg_stats = self._get_pg_stats()
        result = {}

        for col_name, raw_type in schema:
            col_stats = pg_stats.get(col_name, {})
            n_distinct = col_stats.get("n_distinct", 0) or 0

            # Calculate estimated distinct count
            if n_distinct > 0:
                estimated_distinct = int(n_distinct)
            elif n_distinct < 0:
                estimated_distinct = int(abs(n_distinct) * row_count)
            else:
                estimated_distinct = row_count  # Unknown, assume high

            # Classify cardinality
            if estimated_distinct < 20:
                cardinality = "low"
                strategy = "group_by"
            elif estimated_distinct <= 10000:
                cardinality = "medium"
                strategy = "sample"
            else:
                cardinality = "high"
                strategy = "metadata_only"

            result[col_name] = {
                "cardinality": cardinality,
                "n_distinct": n_distinct,
                "estimated_distinct": estimated_distinct,
                "strategy": strategy,
                "dtype": normalize_dtype(raw_type),
            }

        return result

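    # Worked example (hypothetical numbers): with row_count = 1_000_000, a
    # column with n_distinct = 5 is "low" (full GROUP BY is cheap); one with
    # n_distinct = -0.005 estimates 5_000 distinct values and is "medium"
    # (sampled); one with n_distinct = -1.0 (every row distinct, e.g. a
    # primary key) estimates 1_000_000 and is "high" (metadata only).
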

def normalize_pg_type(raw_type: str) -> str:
    """
    Normalize a PostgreSQL type to a simplified type name.

    This is an alias for the shared normalize_dtype function.
    """
    return normalize_dtype(raw_type)
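
For orientation, a minimal driver sketch of the profiling flow, assuming a DatasetHandle named `handle` (built elsewhere in kontra) that already carries PostgresConnectionParams:

    backend = PostgreSQLBackend(handle, sample_size=None)
    backend.connect()
    try:
        schema = backend.get_schema()      # [(column_name, raw_type), ...]
        rows = backend.get_row_count()     # pg_class estimate or exact COUNT(*)
        if backend.supports_metadata_only():
            lite = backend.profile_metadata_only(schema, rows)  # pg_stats estimates
        plan = backend.classify_columns(schema, rows)           # per-column strategy
    finally:
        backend.close()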