kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,577 @@
+# src/kontra/scout/backends/sqlserver_backend.py
+"""
+SQL Server backend for Scout profiler.
+
+Uses system metadata views for efficient profiling.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+from kontra.connectors.handle import DatasetHandle
+from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
+from kontra.scout.dtype_mapping import normalize_dtype
+
+
+class SqlServerBackend:
+    """
+    SQL Server-based profiler backend.
+
+    Features:
+    - Uses sys.dm_db_partition_stats for row count estimates
+    - SQL aggregation for profiling
+    - Dialect-aware SQL (PERCENTILE_CONT instead of MEDIAN)
+    """
+
+    def __init__(
+        self,
+        handle: DatasetHandle,
+        *,
+        sample_size: Optional[int] = None,
+    ):
+        if not handle.db_params:
+            raise ValueError("SQL Server handle missing db_params")
+
+        self.handle = handle
+        self.params: SqlServerConnectionParams = handle.db_params
+        self.sample_size = sample_size
+        self._conn = None
+        self._schema: Optional[List[Tuple[str, str]]] = None
+
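A minimal usage sketch of the backend lifecycle (the handle here is assumed to be a DatasetHandle whose db_params identify the target schema and table; in practice Kontra's connector layer constructs it):

    backend = SqlServerBackend(handle)   # optionally: sample_size=100_000
    backend.connect()
    try:
        schema = backend.get_schema()    # [(column_name, raw_type), ...]
        rows = backend.get_row_count()   # estimate or exact count, see below
    finally:
        backend.close()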
+    def connect(self) -> None:
+        """Establish connection to SQL Server."""
+        self._conn = get_connection(self.params)
+
+    def close(self) -> None:
+        """Close the connection."""
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+
+    def get_schema(self) -> List[Tuple[str, str]]:
+        """Return [(column_name, raw_type), ...]"""
+        if self._schema is not None:
+            return self._schema
+
+        cursor = self._conn.cursor()
+        cursor.execute(
+            """
+            SELECT column_name, data_type
+            FROM information_schema.columns
+            WHERE table_schema = %s AND table_name = %s
+            ORDER BY ordinal_position
+            """,
+            (self.params.schema, self.params.table),
+        )
+        self._schema = [(row[0], row[1]) for row in cursor.fetchall()]
+        return self._schema
+
+    def get_row_count(self) -> int:
+        """
+        Get the row count.
+
+        For large tables, uses the sys.dm_db_partition_stats estimate first
+        (fast, no scan). Falls back to an exact COUNT(*) when the estimate is
+        unusable, or whenever sampling is enabled and an exact total is needed.
+        """
+        cursor = self._conn.cursor()
+
+        # Try partition stats estimate first (instant, no scan)
+        cursor.execute(
+            """
+            SELECT SUM(row_count) AS row_estimate
+            FROM sys.dm_db_partition_stats ps
+            JOIN sys.objects o ON ps.object_id = o.object_id
+            JOIN sys.schemas s ON o.schema_id = s.schema_id
+            WHERE s.name = %s AND o.name = %s AND ps.index_id IN (0, 1)
+            """,
+            (self.params.schema, self.params.table),
+        )
+        row = cursor.fetchone()
+        estimate = int(row[0]) if row and row[0] else 0
+
+        # If estimate is 0 or negative (stats not updated), use COUNT(*)
+        if estimate <= 0:
+            cursor.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
+            row = cursor.fetchone()
+            return int(row[0]) if row else 0
+
+        # If sample_size is set, we need an exact count for accuracy
+        if self.sample_size:
+            cursor.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
+            row = cursor.fetchone()
+            return int(row[0]) if row else 0
+
+        # Use the estimate for large tables
+        if os.getenv("KONTRA_VERBOSE"):
+            print(f"[INFO] sys.dm_db_partition_stats estimate: {estimate} rows")
+        return estimate
+
+    def get_estimated_size_bytes(self) -> Optional[int]:
+        """Estimate size from sys.dm_db_partition_stats."""
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(
+                """
+                SELECT SUM(used_page_count) * 8 * 1024 AS size_bytes
+                FROM sys.dm_db_partition_stats ps
+                JOIN sys.objects o ON ps.object_id = o.object_id
+                JOIN sys.schemas s ON o.schema_id = s.schema_id
+                WHERE s.name = %s AND o.name = %s
+                """,
+                (self.params.schema, self.params.table),
+            )
+            row = cursor.fetchone()
+            return int(row[0]) if row and row[0] else None
+        except Exception:
+            return None
+
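SQL Server pages are 8 KB, which is where the * 8 * 1024 factor comes from: a table occupying 1,280 used pages is reported as 1280 * 8 * 1024 = 10,485,760 bytes (10 MiB). Because nothing filters ps.index_id here, the figure covers all indexes and partitions of the table, not just the heap or clustered index.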
+    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
+        """Execute aggregation query."""
+        if not exprs:
+            return {}
+
+        # Build query with optional sampling
+        table = self._qualified_table()
+        if self.sample_size:
+            # SQL Server row sampling: TABLESAMPLE (N ROWS)
+            sql = f"""
+                SELECT {', '.join(exprs)}
+                FROM {table}
+                TABLESAMPLE ({self.sample_size} ROWS)
+            """
+        else:
+            sql = f"SELECT {', '.join(exprs)} FROM {table}"
+
+        cursor = self._conn.cursor()
+        cursor.execute(sql)
+        row = cursor.fetchone()
+        col_names = [desc[0] for desc in cursor.description]
+        return dict(zip(col_names, row)) if row else {}
+
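The exprs argument is a list of already-aliased SQL aggregate expressions; the aliases become the keys of the returned dict. A hypothetical call (these expressions are illustrative, not the exact ones the profiler builds):

    stats = backend.execute_stats_query([
        "COUNT(*) AS total_rows",
        "COUNT(DISTINCT [status]) AS status_distinct",
        "SUM(CASE WHEN [email] IS NULL THEN 1 ELSE 0 END) AS email_nulls",
    ])
    # e.g. {'total_rows': 125000, 'status_distinct': 4, 'email_nulls': 310}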
+    def fetch_top_values(self, column: str, limit: int) -> List[Tuple[Any, int]]:
+        """Fetch top N most frequent values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT TOP {limit} {col} AS val, COUNT(*) AS cnt
+            FROM {table}
+            WHERE {col} IS NOT NULL
+            GROUP BY {col}
+            ORDER BY cnt DESC
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [(r[0], int(r[1])) for r in cursor.fetchall()]
+        except Exception:
+            return []
+
+    def fetch_distinct_values(self, column: str) -> List[Any]:
+        """Fetch all distinct values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT DISTINCT {col}
+            FROM {table}
+            WHERE {col} IS NOT NULL
+            ORDER BY {col}
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [r[0] for r in cursor.fetchall()]
+        except Exception:
+            return []
+
+    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
+        """Fetch sample values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT TOP {limit} {col}
+            FROM {table}
+            WHERE {col} IS NOT NULL
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [r[0] for r in cursor.fetchall() if r[0] is not None]
+        except Exception:
+            return []
+
+    def esc_ident(self, name: str) -> str:
+        """Escape identifier for SQL Server."""
+        return "[" + name.replace("]", "]]") + "]"
+
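Bracket quoting doubles any closing bracket inside the name, the standard T-SQL identifier escape:

    backend.esc_ident("order details")  # -> [order details]
    backend.esc_ident("weird]name")     # -> [weird]]name]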
+    @property
+    def source_format(self) -> str:
+        """Return source format."""
+        return "sqlserver"
+
+    # ----------------------------- Internal methods -----------------------------
+
+    def _qualified_table(self) -> str:
+        """Return schema.table identifier."""
+        return f"{self.esc_ident(self.params.schema)}.{self.esc_ident(self.params.table)}"
+
+    def _get_object_id(self) -> Optional[int]:
+        """Get the object_id for the table."""
+        cursor = self._conn.cursor()
+        cursor.execute(
+            """
+            SELECT o.object_id
+            FROM sys.objects o
+            JOIN sys.schemas s ON o.schema_id = s.schema_id
+            WHERE s.name = %s AND o.name = %s
+            """,
+            (self.params.schema, self.params.table),
+        )
+        row = cursor.fetchone()
+        return int(row[0]) if row else None
+
+    def supports_metadata_only(self) -> bool:
+        """Check if this backend supports metadata-only profiling."""
+        return True
+
+    def profile_metadata_only(
+        self, schema: List[Tuple[str, str]], row_count: int
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Profile columns using SQL Server metadata (minimal table access).
+
+        SQL Server doesn't store null_frac like PostgreSQL. We use:
+        - sys.dm_db_stats_histogram for distinct count estimates
+        - sys.columns for basic column info
+
+        Note: For null counts, we fall back to a sampled query since
+        SQL Server metadata doesn't include null statistics directly.
+        """
+        cursor = self._conn.cursor()
+        object_id = self._get_object_id()
+
+        if not object_id:
+            # Fallback: return zeroed stats for every column
+            return {col_name: {"null_count": 0, "distinct_count": 0} for col_name, _ in schema}
+
+        # Get stats for each column from sys.dm_db_stats_histogram
+        # This gives us distinct count estimates
+        stats_info: Dict[str, Dict[str, Any]] = {}
+
+        for col_name, raw_type in schema:
+            stats_info[col_name] = {
+                "null_count": 0,
+                "distinct_count": 0,
+                "is_estimate": True,
+            }
+
+        # Query column statistics
+        try:
+            cursor.execute(
+                """
+                SELECT
+                    c.name AS column_name,
+                    s.stats_id,
+                    sp.rows,
+                    sp.rows_sampled,
+                    sp.modification_counter
+                FROM sys.stats s
+                JOIN sys.stats_columns sc ON s.object_id = sc.object_id AND s.stats_id = sc.stats_id
+                JOIN sys.columns c ON sc.object_id = c.object_id AND sc.column_id = c.column_id
+                CROSS APPLY sys.dm_db_stats_properties(s.object_id, s.stats_id) sp
+                WHERE s.object_id = %s AND sc.stats_column_id = 1
+                """,
+                (object_id,),
+            )
+            for row in cursor.fetchall():
+                col_name = row[0]
+                if col_name in stats_info:
+                    stats_info[col_name]["rows"] = row[2]
+                    stats_info[col_name]["rows_sampled"] = row[3]
+        except Exception:
+            pass
+
+        # Get distinct counts from the histogram
+        try:
+            cursor.execute(
+                """
+                SELECT
+                    c.name AS column_name,
+                    SUM(h.distinct_range_rows) + COUNT(*) AS distinct_estimate
+                FROM sys.stats s
+                JOIN sys.stats_columns sc ON s.object_id = sc.object_id AND s.stats_id = sc.stats_id
+                JOIN sys.columns c ON sc.object_id = c.object_id AND sc.column_id = c.column_id
+                CROSS APPLY sys.dm_db_stats_histogram(s.object_id, s.stats_id) h
+                WHERE s.object_id = %s AND sc.stats_column_id = 1
+                GROUP BY c.name
+                """,
+                (object_id,),
+            )
+            for row in cursor.fetchall():
+                col_name = row[0]
+                if col_name in stats_info:
+                    stats_info[col_name]["distinct_count"] = int(row[1]) if row[1] else 0
+        except Exception:
+            # dm_db_stats_histogram might not be available (requires SQL Server 2016 SP1 CU2+)
+            pass
+
+        # For null counts, use a sampled query (SQL Server doesn't store null stats)
+        # Use TABLESAMPLE for efficiency
+        try:
+            null_exprs = []
+            for col_name, _ in schema:
+                c = self.esc_ident(col_name)
+                null_exprs.append(
+                    f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS [{col_name}_nulls]"
+                )
+
+            table = self._qualified_table()
+            # Sample 1% for null estimation
+            sql = f"""
+                SELECT {', '.join(null_exprs)}
+                FROM {table}
+                TABLESAMPLE (1 PERCENT)
+            """
+            cursor.execute(sql)
+            row = cursor.fetchone()
+
+            if row:
+                for i, (col_name, _) in enumerate(schema):
+                    sample_nulls = row[i] or 0
+                    # Extrapolate the 1% sample to the full table (rough estimate)
+                    stats_info[col_name]["null_count"] = int(sample_nulls * 100)
+        except Exception:
+            # TABLESAMPLE might fail on small tables, fall back to full count
+            try:
+                null_exprs = []
+                for col_name, _ in schema:
+                    c = self.esc_ident(col_name)
+                    null_exprs.append(
+                        f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS [{col_name}_nulls]"
+                    )
+                sql = f"SELECT {', '.join(null_exprs)} FROM {self._qualified_table()}"
+                cursor.execute(sql)
+                row = cursor.fetchone()
+                if row:
+                    for i, (col_name, _) in enumerate(schema):
+                        stats_info[col_name]["null_count"] = int(row[i] or 0)
+                        stats_info[col_name]["is_estimate"] = False
+            except Exception:
+                pass
+
+        return stats_info
+
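The extrapolation is a straight multiplication: with a 1 PERCENT sample, each observed NULL stands in for roughly 100 rows, so 420 NULLs in the sample produce an estimated null_count of 42,000. Callers can use the is_estimate flag to distinguish these sampled figures from the exact counts set by the full-scan fallback.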
+    def get_table_freshness(self) -> Dict[str, Any]:
+        """
+        Get table statistics freshness from sys.dm_db_stats_properties.
+
+        Returns dict with:
+        - modification_counter: rows modified since last stats update
+        - rows: row count from stats
+        - last_updated: timestamp of last stats update
+        - stale_ratio: modification_counter / rows
+        - is_fresh: True if stale_ratio < 0.2
+        """
+        cursor = self._conn.cursor()
+        object_id = self._get_object_id()
+
+        if not object_id:
+            return {
+                "modification_counter": 0,
+                "rows": 0,
+                "last_updated": None,
+                "stale_ratio": 1.0,
+                "is_fresh": False,
+            }
+
+        try:
+            cursor.execute(
+                """
+                SELECT TOP 1
+                    sp.last_updated,
+                    sp.modification_counter,
+                    sp.rows
+                FROM sys.stats s
+                CROSS APPLY sys.dm_db_stats_properties(s.object_id, s.stats_id) sp
+                WHERE s.object_id = %s
+                ORDER BY sp.last_updated DESC
+                """,
+                (object_id,),
+            )
+            row = cursor.fetchone()
+
+            if not row:
+                return {
+                    "modification_counter": 0,
+                    "rows": 0,
+                    "last_updated": None,
+                    "stale_ratio": 1.0,
+                    "is_fresh": False,
+                }
+
+            last_updated = row[0]
+            modification_counter = row[1] or 0
+            rows = row[2] or 0
+
+            stale_ratio = modification_counter / rows if rows > 0 else 1.0
+
+            return {
+                "modification_counter": modification_counter,
+                "rows": rows,
+                "last_updated": last_updated,
+                "stale_ratio": stale_ratio,
+                "is_fresh": stale_ratio < 0.2,
+            }
+        except Exception:
+            return {
+                "modification_counter": 0,
+                "rows": 0,
+                "last_updated": None,
+                "stale_ratio": 1.0,
+                "is_fresh": False,
+            }
+
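A sketch of how a caller might interpret the result; the 0.2 threshold mirrors the is_fresh computation above, and the concrete values are hypothetical:

    fresh = backend.get_table_freshness()
    # e.g. {'modification_counter': 5000, 'rows': 100000,
    #       'last_updated': <datetime>, 'stale_ratio': 0.05, 'is_fresh': True}
    if not fresh["is_fresh"]:
        # over 20% of rows changed since the last stats update, so the
        # histogram-derived distinct counts above should be treated with caution
        ...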
+    def supports_strategic_standard(self) -> bool:
+        """Check if this backend supports strategic standard profiling."""
+        return True
+
+    def execute_sampled_stats_query(
+        self, exprs: List[str], sample_pct: float = 1.0
+    ) -> Dict[str, Any]:
+        """
+        Execute aggregation query with TABLESAMPLE (block sampling).
+
+        SQL Server's TABLESAMPLE works at the page level, so for small tables
+        low percentages may return 0 rows. We fall back to full table scan
+        for tables under 10K rows or if sampling returns no data.
+
+        Args:
+            exprs: List of SQL expressions to compute
+            sample_pct: Percentage to sample (default 1%)
+
+        Returns:
+            Dict of expression alias -> value
+        """
+        if not exprs:
+            return {}
+
+        table = self._qualified_table()
+
+        # For small tables, TABLESAMPLE may return 0 rows at low percentages,
+        # so skip sampling entirely under 10K rows and do a full scan instead
+        row_count = self.get_row_count()
+        if row_count < 10000:
+            return self.execute_stats_query(exprs)
+
+        sql = f"""
+            SELECT {', '.join(exprs)}
+            FROM {table}
+            TABLESAMPLE ({sample_pct} PERCENT)
+        """
+
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            row = cursor.fetchone()
+            col_names = [desc[0] for desc in cursor.description]
+            result = dict(zip(col_names, row)) if row else {}
+
+            # Check if we got data - if all values are None, fall back to full scan
+            if all(v is None for v in result.values()):
+                return self.execute_stats_query(exprs)
+
+            return result
+        except Exception:
+            # Fall back to full query if TABLESAMPLE fails
+            return self.execute_stats_query(exprs)
+
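Because TABLESAMPLE selects whole 8 KB pages rather than individual rows, the sampled aggregates inherit whatever skew the chosen pages carry; the method trades that accuracy for speed on large tables. A hypothetical call (the expression is illustrative):

    stats = backend.execute_sampled_stats_query(
        ["AVG(CAST([amount] AS FLOAT)) AS amount_avg"],
        sample_pct=1.0,
    )
    # Falls back to execute_stats_query() for tables under 10K rows,
    # for empty samples, and on any TABLESAMPLE error.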
+    def fetch_low_cardinality_values_batched(
+        self, columns: List[str]
+    ) -> Dict[str, List[Tuple[Any, int]]]:
+        """
+        Fetch value distributions for multiple low-cardinality columns in one query.
+
+        Uses UNION ALL to batch multiple GROUP BY queries into a single round-trip.
+        """
+        if not columns:
+            return {}
+
+        table = self._qualified_table()
+        parts = []
+        for col in columns:
+            c = self.esc_ident(col)
+            # Escape the column name where it appears as a string literal
+            lit = col.replace("'", "''")
+            parts.append(f"""
+                SELECT '{lit}' AS col_name, CAST({c} AS NVARCHAR(MAX)) AS val, COUNT(*) AS cnt
+                FROM {table}
+                WHERE {c} IS NOT NULL
+                GROUP BY {c}
+            """)
+
+        sql = " UNION ALL ".join(parts) + " ORDER BY col_name, cnt DESC"
+
+        result: Dict[str, List[Tuple[Any, int]]] = {col: [] for col in columns}
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            for row in cursor.fetchall():
+                col_name, val, cnt = row
+                if col_name in result:
+                    result[col_name].append((val, int(cnt)))
+        except Exception:
+            pass
+
+        return result
+
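For columns ["status", "region"], the generated statement has roughly this shape (the table name is a placeholder):

    SELECT 'status' AS col_name, CAST([status] AS NVARCHAR(MAX)) AS val, COUNT(*) AS cnt
    FROM [dbo].[orders] WHERE [status] IS NOT NULL GROUP BY [status]
    UNION ALL
    SELECT 'region' AS col_name, CAST([region] AS NVARCHAR(MAX)) AS val, COUNT(*) AS cnt
    FROM [dbo].[orders] WHERE [region] IS NOT NULL GROUP BY [region]
    ORDER BY col_name, cnt DESC

The CAST to NVARCHAR(MAX) gives every branch of the UNION a common value type, at the cost of returning all values as strings.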
+    def classify_columns(
+        self, schema: List[Tuple[str, str]], row_count: int
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Classify columns based on histogram metadata for strategic profiling.
+
+        Classification rules:
+        - low: distinct < 20 → fetch all via GROUP BY
+        - medium: distinct 20-10000 → sample for top values
+        - high: distinct > 10000 → trust histogram only
+        """
+        # First get metadata
+        metadata = self.profile_metadata_only(schema, row_count)
+
+        result = {}
+        for col_name, raw_type in schema:
+            col_meta = metadata.get(col_name, {})
+            distinct_count = col_meta.get("distinct_count", 0)
+
+            # If we don't have a distinct count, estimate from row_count
+            if distinct_count == 0:
+                distinct_count = row_count  # Assume high cardinality
+
+            # Classify cardinality
+            if distinct_count < 20:
+                cardinality = "low"
+                strategy = "group_by"
+            elif distinct_count <= 10000:
+                cardinality = "medium"
+                strategy = "sample"
+            else:
+                cardinality = "high"
+                strategy = "metadata_only"
+
+            result[col_name] = {
+                "cardinality": cardinality,
+                "distinct_count": distinct_count,
+                "strategy": strategy,
+                "dtype": normalize_dtype(raw_type),
+            }
+
+        return result
+
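A sketch of the classification output for a hypothetical 1M-row table (the dtype values come from normalize_dtype in kontra/scout/dtype_mapping.py, so they are elided here):

    backend.classify_columns(schema, row_count=1_000_000)
    # {'status':   {'cardinality': 'low',    'distinct_count': 4,       'strategy': 'group_by',      'dtype': ...},
    #  'zip_code': {'cardinality': 'medium', 'distinct_count': 8000,    'strategy': 'sample',        'dtype': ...},
    #  'user_id':  {'cardinality': 'high',   'distinct_count': 1000000, 'strategy': 'metadata_only', 'dtype': ...}}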
+
+def normalize_sqlserver_type(raw_type: str) -> str:
+    """
+    Normalize a SQL Server type to a simplified type name.
+
+    This is an alias for the shared normalize_dtype function.
+    """
+    return normalize_dtype(raw_type)