kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,577 @@
+# src/kontra/scout/backends/sqlserver_backend.py
+"""
+SQL Server backend for Scout profiler.
+
+Uses system metadata views for efficient profiling.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+from kontra.connectors.handle import DatasetHandle
+from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
+from kontra.scout.dtype_mapping import normalize_dtype
+
+
+class SqlServerBackend:
+    """
+    SQL Server-based profiler backend.
+
+    Features:
+    - Uses sys.dm_db_partition_stats for row count estimates
+    - SQL aggregation for profiling
+    - Dialect-aware SQL (PERCENTILE_CONT instead of MEDIAN)
+    """
+
+    def __init__(
+        self,
+        handle: DatasetHandle,
+        *,
+        sample_size: Optional[int] = None,
+    ):
+        if not handle.db_params:
+            raise ValueError("SQL Server handle missing db_params")
+
+        self.handle = handle
+        self.params: SqlServerConnectionParams = handle.db_params
+        self.sample_size = sample_size
+        self._conn = None
+        self._schema: Optional[List[Tuple[str, str]]] = None
+
+    def connect(self) -> None:
+        """Establish connection to SQL Server."""
+        self._conn = get_connection(self.params)
+
+    def close(self) -> None:
+        """Close the connection."""
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+
+    def get_schema(self) -> List[Tuple[str, str]]:
+        """Return [(column_name, raw_type), ...]"""
+        if self._schema is not None:
+            return self._schema
+
+        cursor = self._conn.cursor()
+        cursor.execute(
+            """
+            SELECT column_name, data_type
+            FROM information_schema.columns
+            WHERE table_schema = %s AND table_name = %s
+            ORDER BY ordinal_position
+            """,
+            (self.params.schema, self.params.table),
+        )
+        self._schema = [(row[0], row[1]) for row in cursor.fetchall()]
+        return self._schema
+
+    def get_row_count(self) -> int:
+        """
+        Get row count.
+
+        For large tables, uses sys.dm_db_partition_stats estimate first (fast).
+        Falls back to COUNT(*) for accuracy.
+        """
+        cursor = self._conn.cursor()
+
+        # Try partition stats estimate first (instant, no scan)
+        cursor.execute(
+            """
+            SELECT SUM(row_count) AS row_estimate
+            FROM sys.dm_db_partition_stats ps
+            JOIN sys.objects o ON ps.object_id = o.object_id
+            JOIN sys.schemas s ON o.schema_id = s.schema_id
+            WHERE s.name = %s AND o.name = %s AND ps.index_id IN (0, 1)
+            """,
+            (self.params.schema, self.params.table),
+        )
+        row = cursor.fetchone()
+        estimate = int(row[0]) if row and row[0] else 0
+
+        # If estimate is 0 or negative (stats not updated), use COUNT
+        if estimate <= 0:
+            cursor.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
+            row = cursor.fetchone()
+            return int(row[0]) if row else 0
+
+        # If sample_size is set, we need exact count for accuracy
+        if self.sample_size:
+            cursor.execute(f"SELECT COUNT(*) FROM {self._qualified_table()}")
+            row = cursor.fetchone()
+            return int(row[0]) if row else 0
+
+        # Use estimate for large tables
+        if os.getenv("KONTRA_VERBOSE"):
+            print(f"[INFO] sys.dm_db_partition_stats estimate: {estimate} rows")
+        return estimate
+
+    def get_estimated_size_bytes(self) -> Optional[int]:
+        """Estimate size from sys.dm_db_partition_stats."""
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(
+                """
+                SELECT SUM(used_page_count) * 8 * 1024 AS size_bytes
+                FROM sys.dm_db_partition_stats ps
+                JOIN sys.objects o ON ps.object_id = o.object_id
+                JOIN sys.schemas s ON o.schema_id = s.schema_id
+                WHERE s.name = %s AND o.name = %s
+                """,
+                (self.params.schema, self.params.table),
+            )
+            row = cursor.fetchone()
+            return int(row[0]) if row and row[0] else None
+        except Exception:
+            return None
+
+    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
+        """Execute aggregation query."""
+        if not exprs:
+            return {}
+
+        # Build query with optional sampling
+        table = self._qualified_table()
+        if self.sample_size:
+            # SQL Server sampling: TABLESAMPLE ROWS
+            sql = f"""
+                SELECT {', '.join(exprs)}
+                FROM {table}
+                TABLESAMPLE ({self.sample_size} ROWS)
+            """
+        else:
+            sql = f"SELECT {', '.join(exprs)} FROM {table}"
+
+        cursor = self._conn.cursor()
+        cursor.execute(sql)
+        row = cursor.fetchone()
+        col_names = [desc[0] for desc in cursor.description]
+        return dict(zip(col_names, row)) if row else {}
+
+    def fetch_top_values(self, column: str, limit: int) -> List[Tuple[Any, int]]:
+        """Fetch top N most frequent values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT TOP {limit} {col} AS val, COUNT(*) AS cnt
+            FROM {table}
+            WHERE {col} IS NOT NULL
+            GROUP BY {col}
+            ORDER BY cnt DESC
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [(r[0], int(r[1])) for r in cursor.fetchall()]
+        except Exception:
+            return []
+
+    def fetch_distinct_values(self, column: str) -> List[Any]:
+        """Fetch all distinct values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT DISTINCT {col}
+            FROM {table}
+            WHERE {col} IS NOT NULL
+            ORDER BY {col}
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [r[0] for r in cursor.fetchall()]
+        except Exception:
+            return []
+
+    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
+        """Fetch sample values."""
+        col = self.esc_ident(column)
+        table = self._qualified_table()
+        sql = f"""
+            SELECT TOP {limit} {col}
+            FROM {table}
+            WHERE {col} IS NOT NULL
+        """
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            return [r[0] for r in cursor.fetchall() if r[0] is not None]
+        except Exception:
+            return []
+
+    def esc_ident(self, name: str) -> str:
+        """Escape identifier for SQL Server."""
+        return "[" + name.replace("]", "]]") + "]"
+
+    @property
+    def source_format(self) -> str:
+        """Return source format."""
+        return "sqlserver"
+
+    # ----------------------------- Internal methods -----------------------------
+
+    def _qualified_table(self) -> str:
+        """Return schema.table identifier."""
+        return f"{self.esc_ident(self.params.schema)}.{self.esc_ident(self.params.table)}"
+
+    def _get_object_id(self) -> Optional[int]:
+        """Get the object_id for the table."""
+        cursor = self._conn.cursor()
+        cursor.execute(
+            """
+            SELECT o.object_id
+            FROM sys.objects o
+            JOIN sys.schemas s ON o.schema_id = s.schema_id
+            WHERE s.name = %s AND o.name = %s
+            """,
+            (self.params.schema, self.params.table),
+        )
+        row = cursor.fetchone()
+        return int(row[0]) if row else None
+
+    def supports_metadata_only(self) -> bool:
+        """Check if this backend supports metadata-only profiling."""
+        return True
+
+    def profile_metadata_only(
+        self, schema: List[Tuple[str, str]], row_count: int
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Profile columns using SQL Server metadata (minimal table access).
+
+        SQL Server doesn't store null_frac like PostgreSQL. We use:
+        - sys.dm_db_stats_histogram for distinct count estimates
+        - sys.columns for basic column info
+
+        Note: For null counts, we fall back to a sampled query since
+        SQL Server metadata doesn't include null statistics directly.
+        """
+        cursor = self._conn.cursor()
+        object_id = self._get_object_id()
+
+        if not object_id:
+            # Fallback: return empty metadata
+            return {col_name: {"null_count": 0, "distinct_count": 0} for col_name, _ in schema}
+
+        # Get stats for each column from sys.dm_db_stats_histogram
+        # This gives us distinct count estimates
+        stats_info: Dict[str, Dict[str, Any]] = {}
+
+        for col_name, raw_type in schema:
+            stats_info[col_name] = {
+                "null_count": 0,
+                "distinct_count": 0,
+                "is_estimate": True,
+            }
+
+        # Query column statistics
+        try:
+            cursor.execute(
+                """
+                SELECT
+                    c.name AS column_name,
+                    s.stats_id,
+                    sp.rows,
+                    sp.rows_sampled,
+                    sp.modification_counter
+                FROM sys.stats s
+                JOIN sys.stats_columns sc ON s.object_id = sc.object_id AND s.stats_id = sc.stats_id
+                JOIN sys.columns c ON sc.object_id = c.object_id AND sc.column_id = c.column_id
+                CROSS APPLY sys.dm_db_stats_properties(s.object_id, s.stats_id) sp
+                WHERE s.object_id = %s AND sc.stats_column_id = 1
+                """,
+                (object_id,),
+            )
+            for row in cursor.fetchall():
+                col_name = row[0]
+                if col_name in stats_info:
+                    stats_info[col_name]["rows"] = row[2]
+                    stats_info[col_name]["rows_sampled"] = row[3]
+        except Exception:
+            pass
+
+        # Get distinct counts from histogram
+        try:
+            cursor.execute(
+                """
+                SELECT
+                    c.name AS column_name,
+                    SUM(h.distinct_range_rows) + COUNT(*) AS distinct_estimate
+                FROM sys.stats s
+                JOIN sys.stats_columns sc ON s.object_id = sc.object_id AND s.stats_id = sc.stats_id
+                JOIN sys.columns c ON sc.object_id = c.object_id AND sc.column_id = c.column_id
+                CROSS APPLY sys.dm_db_stats_histogram(s.object_id, s.stats_id) h
+                WHERE s.object_id = %s AND sc.stats_column_id = 1
+                GROUP BY c.name
+                """,
+                (object_id,),
+            )
+            for row in cursor.fetchall():
+                col_name = row[0]
+                if col_name in stats_info:
+                    stats_info[col_name]["distinct_count"] = int(row[1]) if row[1] else 0
+        except Exception:
+            # dm_db_stats_histogram might not be available (requires SQL Server 2016 SP1 CU2+)
+            pass
+
+        # For null counts, use a sampled query (SQL Server doesn't store null stats)
+        # Use TABLESAMPLE for efficiency
+        try:
+            null_exprs = []
+            for col_name, _ in schema:
+                c = self.esc_ident(col_name)
+                null_exprs.append(
+                    f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS [{col_name}_nulls]"
+                )
+
+            table = self._qualified_table()
+            # Sample 1% for null estimation
+            sql = f"""
+                SELECT {', '.join(null_exprs)}
+                FROM {table}
+                TABLESAMPLE (1 PERCENT)
+            """
+            cursor.execute(sql)
+            row = cursor.fetchone()
+
+            if row:
+                for i, (col_name, _) in enumerate(schema):
+                    sample_nulls = row[i] or 0
+                    # Extrapolate to full table (rough estimate)
+                    stats_info[col_name]["null_count"] = int(sample_nulls * 100)
+        except Exception:
+            # TABLESAMPLE might fail on small tables, fall back to full count
+            try:
+                null_exprs = []
+                for col_name, _ in schema:
+                    c = self.esc_ident(col_name)
+                    null_exprs.append(
+                        f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS [{col_name}_nulls]"
+                    )
+                sql = f"SELECT {', '.join(null_exprs)} FROM {self._qualified_table()}"
+                cursor.execute(sql)
+                row = cursor.fetchone()
+                if row:
+                    for i, (col_name, _) in enumerate(schema):
+                        stats_info[col_name]["null_count"] = int(row[i] or 0)
+                        stats_info[col_name]["is_estimate"] = False
+            except Exception:
+                pass
+
+        return stats_info
+
+    def get_table_freshness(self) -> Dict[str, Any]:
+        """
+        Get table statistics freshness from sys.dm_db_stats_properties.
+
+        Returns dict with:
+        - modification_counter: rows modified since last stats update
+        - rows: row count from stats
+        - last_updated: timestamp of last stats update
+        - stale_ratio: modification_counter / rows
+        - is_fresh: True if stale_ratio < 0.2
+        """
+        cursor = self._conn.cursor()
+        object_id = self._get_object_id()
+
+        if not object_id:
+            return {
+                "modification_counter": 0,
+                "rows": 0,
+                "last_updated": None,
+                "stale_ratio": 1.0,
+                "is_fresh": False,
+            }
+
+        try:
+            cursor.execute(
+                """
+                SELECT TOP 1
+                    sp.last_updated,
+                    sp.modification_counter,
+                    sp.rows
+                FROM sys.stats s
+                CROSS APPLY sys.dm_db_stats_properties(s.object_id, s.stats_id) sp
+                WHERE s.object_id = %s
+                ORDER BY sp.last_updated DESC
+                """,
+                (object_id,),
+            )
+            row = cursor.fetchone()
+
+            if not row:
+                return {
+                    "modification_counter": 0,
+                    "rows": 0,
+                    "last_updated": None,
+                    "stale_ratio": 1.0,
+                    "is_fresh": False,
+                }
+
+            last_updated = row[0]
+            modification_counter = row[1] or 0
+            rows = row[2] or 0
+
+            stale_ratio = modification_counter / max(rows, 1) if rows > 0 else 1.0
+
+            return {
+                "modification_counter": modification_counter,
+                "rows": rows,
+                "last_updated": last_updated,
+                "stale_ratio": stale_ratio,
+                "is_fresh": stale_ratio < 0.2,
+            }
+        except Exception:
+            return {
+                "modification_counter": 0,
+                "rows": 0,
+                "last_updated": None,
+                "stale_ratio": 1.0,
+                "is_fresh": False,
+            }
+
+    def supports_strategic_standard(self) -> bool:
+        """Check if this backend supports strategic standard profiling."""
+        return True
+
+    def execute_sampled_stats_query(
+        self, exprs: List[str], sample_pct: float = 1.0
+    ) -> Dict[str, Any]:
+        """
+        Execute aggregation query with TABLESAMPLE (block sampling).
+
+        SQL Server's TABLESAMPLE works at the page level, so for small tables
+        low percentages may return 0 rows. We fall back to full table scan
+        for tables under 10K rows or if sampling returns no data.
+
+        Args:
+            exprs: List of SQL expressions to compute
+            sample_pct: Percentage to sample (default 1%)
+
+        Returns:
+            Dict of expression alias -> value
+        """
+        if not exprs:
+            return {}
+
+        table = self._qualified_table()
+
+        # For small tables, TABLESAMPLE may return 0 rows at low percentages
+        # Use a minimum of 10% for tables under 10K rows
+        row_count = self.get_row_count()
+        if row_count < 10000:
+            # Skip sampling for small tables - just do full scan
+            return self.execute_stats_query(exprs)
+
+        sql = f"""
+            SELECT {', '.join(exprs)}
+            FROM {table}
+            TABLESAMPLE ({sample_pct} PERCENT)
+        """
+
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            row = cursor.fetchone()
+            col_names = [desc[0] for desc in cursor.description]
+            result = dict(zip(col_names, row)) if row else {}
+
+            # Check if we got data - if all values are None, fall back to full scan
+            if all(v is None for v in result.values()):
+                return self.execute_stats_query(exprs)
+
+            return result
+        except Exception:
+            # Fall back to full query if TABLESAMPLE fails
+            return self.execute_stats_query(exprs)
+
+    def fetch_low_cardinality_values_batched(
+        self, columns: List[str]
+    ) -> Dict[str, List[Tuple[Any, int]]]:
+        """
+        Fetch value distributions for multiple low-cardinality columns in one query.
+
+        Uses UNION ALL to batch multiple GROUP BY queries into a single round-trip.
+        """
+        if not columns:
+            return {}
+
+        table = self._qualified_table()
+        parts = []
+        for col in columns:
+            c = self.esc_ident(col)
+            parts.append(f"""
+                SELECT '{col}' AS col_name, CAST({c} AS NVARCHAR(MAX)) AS val, COUNT(*) AS cnt
+                FROM {table}
+                WHERE {c} IS NOT NULL
+                GROUP BY {c}
+            """)
+
+        sql = " UNION ALL ".join(parts) + " ORDER BY col_name, cnt DESC"
+
+        result: Dict[str, List[Tuple[Any, int]]] = {col: [] for col in columns}
+        try:
+            cursor = self._conn.cursor()
+            cursor.execute(sql)
+            for row in cursor.fetchall():
+                col_name, val, cnt = row
+                if col_name in result:
+                    result[col_name].append((val, int(cnt)))
+        except Exception:
+            pass
+
+        return result
+
+    def classify_columns(
+        self, schema: List[Tuple[str, str]], row_count: int
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Classify columns based on histogram metadata for strategic profiling.
+
+        Classification rules:
+        - low: distinct < 20 → fetch all via GROUP BY
+        - medium: distinct 20-10000 → sample for top values
+        - high: distinct > 10000 → trust histogram only
+        """
+        # First get metadata
+        metadata = self.profile_metadata_only(schema, row_count)
+
+        result = {}
+        for col_name, raw_type in schema:
+            col_meta = metadata.get(col_name, {})
+            distinct_count = col_meta.get("distinct_count", 0)
+
+            # If we don't have distinct count, estimate from row_count
+            if distinct_count == 0:
+                distinct_count = row_count  # Assume high cardinality
+
+            # Classify cardinality
+            if distinct_count < 20:
+                cardinality = "low"
+                strategy = "group_by"
+            elif distinct_count <= 10000:
+                cardinality = "medium"
+                strategy = "sample"
+            else:
+                cardinality = "high"
+                strategy = "metadata_only"
+
+            result[col_name] = {
+                "cardinality": cardinality,
+                "distinct_count": distinct_count,
+                "strategy": strategy,
+                "dtype": normalize_dtype(raw_type),
+            }
+
+        return result
+
+
+def normalize_sqlserver_type(raw_type: str) -> str:
+    """
+    Normalize a SQL Server type to a simplified type name.
+
+    This is an alias for the shared normalize_dtype function.
+    """
+    return normalize_dtype(raw_type)