kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/scout/backends/duckdb_backend.py
@@ -0,0 +1,359 @@
+# src/kontra/scout/backends/duckdb_backend.py
+"""
+DuckDB backend for Scout profiler.
+
+Supports Parquet and CSV files (local + S3/HTTP).
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import duckdb
+
+try:
+    import pyarrow.parquet as pq
+    import pyarrow.fs as pafs
+
+    _HAS_PYARROW = True
+except ImportError:
+    _HAS_PYARROW = False
+
+from kontra.connectors.handle import DatasetHandle
+from kontra.engine.backends.duckdb_session import create_duckdb_connection
+from kontra.engine.backends.duckdb_utils import esc_ident as duckdb_esc_ident
+from kontra.engine.backends.duckdb_utils import lit_str
+
+
+class DuckDBBackend:
+    """
+    DuckDB-based profiler backend for Parquet and CSV files.
+
+    Features:
+    - Parquet metadata extraction (row count from footer)
+    - Single-pass aggregation queries
+    - Sampling support
+    - S3/HTTP support via DuckDB httpfs
+    """
+
+    def __init__(
+        self,
+        handle: DatasetHandle,
+        *,
+        sample_size: Optional[int] = None,
+    ):
+        self.handle = handle
+        self.sample_size = sample_size
+        self.con: Optional[duckdb.DuckDBPyConnection] = None
+        self._parquet_metadata: Optional[Any] = None
+        self._view_name = "_scout"
+
+    def connect(self) -> None:
+        """Create DuckDB connection and source view."""
+        self.con = create_duckdb_connection(self.handle)
+        self._create_source_view()
+
+    def close(self) -> None:
+        """Clean up resources."""
+        if self.con:
+            try:
+                self.con.execute(f"DROP VIEW IF EXISTS {self._view_name}")
+            except Exception:
+                pass
+
+    def get_schema(self) -> List[Tuple[str, str]]:
+        """Return [(column_name, raw_type), ...]"""
+        cur = self.con.execute(f"SELECT * FROM {self._view_name} LIMIT 0")
+        return [(d[0], str(d[1])) for d in cur.description]
+
+    def get_row_count(self) -> int:
+        """
+        Get row count, using Parquet metadata if available.
+
+        For Parquet files, the row count is extracted from the footer
+        without scanning data (fast). For CSV/other formats, a COUNT query is used.
+        """
+        # Try Parquet metadata first (no scan)
+        if self.handle.format == "parquet" and _HAS_PYARROW and self.sample_size is None:
+            try:
+                meta = self._get_parquet_metadata()
+                if meta:
+                    if os.getenv("KONTRA_VERBOSE"):
+                        print(f"[INFO] Parquet metadata: {meta.num_rows} rows from footer")
+                    return meta.num_rows
+            except Exception:
+                pass
+
+        # Fall back to query
+        result = self.con.execute(f"SELECT COUNT(*) FROM {self._view_name}").fetchone()
+        return int(result[0]) if result else 0
+
+    def get_estimated_size_bytes(self) -> Optional[int]:
+        """Get estimated size from Parquet metadata."""
+        if self.handle.format == "parquet" and _HAS_PYARROW:
+            try:
+                meta = self._get_parquet_metadata()
+                if meta:
+                    return meta.serialized_size
+            except Exception:
+                pass
+        return None
+
+    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
+        """Execute aggregation query with multiple expressions."""
+        if not exprs:
+            return {}
+
+        sql = f"SELECT {', '.join(exprs)} FROM {self._view_name}"
+        cur = self.con.execute(sql)
+        row = cur.fetchone()
+        col_names = [d[0] for d in cur.description]
+        return dict(zip(col_names, row)) if row else {}
+
+    def fetch_top_values(self, column: str, limit: int) -> List[Tuple[Any, int]]:
+        """Fetch top N most frequent values."""
+        col = self.esc_ident(column)
+        sql = f"""
+            SELECT {col} AS val, COUNT(*) AS cnt
+            FROM {self._view_name}
+            WHERE {col} IS NOT NULL
+            GROUP BY {col}
+            ORDER BY cnt DESC
+            LIMIT {limit}
+        """
+        try:
+            rows = self.con.execute(sql).fetchall()
+            return [(r[0], int(r[1])) for r in rows]
+        except Exception:
+            return []
+
+    def fetch_distinct_values(self, column: str) -> List[Any]:
+        """Fetch all distinct values for a column."""
+        col = self.esc_ident(column)
+        sql = f"""
+            SELECT DISTINCT {col}
+            FROM {self._view_name}
+            WHERE {col} IS NOT NULL
+            ORDER BY {col}
+        """
+        try:
+            rows = self.con.execute(sql).fetchall()
+            return [r[0] for r in rows]
+        except Exception:
+            return []
+
+    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
+        """Fetch sample values for pattern detection."""
+        col = self.esc_ident(column)
+        sql = f"""
+            SELECT {col}
+            FROM {self._view_name}
+            WHERE {col} IS NOT NULL
+            LIMIT {limit}
+        """
+        try:
+            rows = self.con.execute(sql).fetchall()
+            return [r[0] for r in rows if r[0] is not None]
+        except Exception:
+            return []
+
+    def esc_ident(self, name: str) -> str:
+        """Escape identifier for DuckDB."""
+        return duckdb_esc_ident(name)
+
+    @property
+    def source_format(self) -> str:
+        """Return source format."""
+        return self.handle.format or "unknown"
+
+    # ----------------------------- Internal methods -----------------------------
+
+    def _create_source_view(self) -> None:
+        """Create a DuckDB view over the source, optionally with sampling."""
+        fmt = (self.handle.format or "").lower()
+        uri = self.handle.uri
+
+        if fmt == "parquet":
+            read_fn = f"read_parquet({lit_str(uri)})"
+        elif fmt == "csv":
+            read_fn = f"read_csv_auto({lit_str(uri)})"
+        else:
+            # Try parquet first
+            read_fn = f"read_parquet({lit_str(uri)})"
+
+        if self.sample_size:
+            sql = f"""
+                CREATE OR REPLACE VIEW {self._view_name} AS
+                SELECT * FROM {read_fn}
+                USING SAMPLE {int(self.sample_size)} ROWS
+            """
+        else:
+            sql = f"CREATE OR REPLACE VIEW {self._view_name} AS SELECT * FROM {read_fn}"
+
+        self.con.execute(sql)
+
+    def _get_parquet_metadata(self) -> Optional[Any]:
+        """Extract Parquet metadata without reading data."""
+        if not _HAS_PYARROW:
+            return None
+
+        if self._parquet_metadata is not None:
+            return self._parquet_metadata
+
+        try:
+            uri = self.handle.uri
+            fs = None
+
+            # Handle S3
+            if self.handle.scheme == "s3":
+                opts = self.handle.fs_opts or {}
+                kwargs: Dict[str, Any] = {}
+                if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
+                    kwargs["access_key"] = opts["s3_access_key_id"]
+                    kwargs["secret_key"] = opts["s3_secret_access_key"]
+                if opts.get("s3_endpoint"):
+                    endpoint = opts["s3_endpoint"]
+                    if endpoint.startswith("http://"):
+                        endpoint = endpoint[7:]
+                        kwargs["scheme"] = "http"
+                    elif endpoint.startswith("https://"):
+                        endpoint = endpoint[8:]
+                        kwargs["scheme"] = "https"
+                    kwargs["endpoint_override"] = endpoint
+                if opts.get("s3_url_style", "").lower() == "path" or opts.get("s3_endpoint"):
+                    kwargs["force_virtual_addressing"] = False
+
+                fs = pafs.S3FileSystem(**kwargs)
+                if uri.lower().startswith("s3://"):
+                    uri = uri[5:]
+
+            # Handle Azure (ADLS Gen2, Azure Blob)
+            if self.handle.scheme in ("abfs", "abfss", "az"):
+                opts = self.handle.fs_opts or {}
+                kwargs: Dict[str, Any] = {}
+
+                if opts.get("azure_account_name"):
+                    kwargs["account_name"] = opts["azure_account_name"]
+                if opts.get("azure_account_key"):
+                    kwargs["account_key"] = opts["azure_account_key"]
+                if opts.get("azure_sas_token"):
+                    # PyArrow expects SAS token as 'sas_token' credential
+                    sas = opts["azure_sas_token"]
+                    if sas.startswith("?"):
+                        sas = sas[1:]
+                    kwargs["sas_token"] = sas
+
+                try:
+                    fs = pafs.AzureFileSystem(**kwargs)
+                    # Strip scheme prefix for PyArrow
+                    if uri.lower().startswith("abfss://"):
+                        uri = uri[8:]
+                    elif uri.lower().startswith("abfs://"):
+                        uri = uri[7:]
+                    elif uri.lower().startswith("az://"):
+                        uri = uri[5:]
+                except Exception:
+                    # Azure filesystem not available or credentials invalid
+                    # Fall back to DuckDB-based profiling
+                    return None
+
+            pf = pq.ParquetFile(uri, filesystem=fs)
+            self._parquet_metadata = pf.metadata
+            return self._parquet_metadata
+
+        except Exception:
+            return None
+
+    def supports_metadata_only(self) -> bool:
+        """
+        Check if this backend supports metadata-only profiling.
+
+        Returns True only for Parquet files when PyArrow is available.
+        CSV files don't have metadata statistics.
+        """
+        return (
+            self.handle.format == "parquet"
+            and _HAS_PYARROW
+            and self.sample_size is None
+        )
+
+    def profile_metadata_only(
+        self, schema: List[Tuple[str, str]], row_count: int
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Profile columns using only Parquet metadata (no data scan).
+
+        Returns dict mapping column_name -> {null_count, distinct_count, ...}
+
+        Parquet row group statistics provide:
+        - null_count: Exact count of nulls (sum across row groups)
+        - num_values: Non-null values per row group
+        - min/max: Column min/max (for potential use)
+
+        Note: Parquet does NOT store distinct_count. We estimate from
+        num_values (assuming all non-null values are distinct as upper bound).
+
+        This is used for the 'lite' preset to achieve fast profiling
+        without scanning the actual data.
+        """
+        meta = self._get_parquet_metadata()
+        if not meta:
+            raise RuntimeError("Cannot get Parquet metadata")
+
+        # Build column stats by aggregating across row groups
+        col_stats: Dict[str, Dict[str, Any]] = {}
+
+        # Initialize stats for each column
+        for col_name, _ in schema:
+            col_stats[col_name] = {
+                "null_count": 0,
+                "num_values": 0,
+                "has_statistics": False,
+            }
+
+        # Aggregate stats from all row groups
+        for rg_idx in range(meta.num_row_groups):
+            rg = meta.row_group(rg_idx)
+
+            for col_idx in range(rg.num_columns):
+                col_chunk = rg.column(col_idx)
+                # Get column name from path (handles nested columns)
+                col_path = col_chunk.path_in_schema
+                col_name = col_path.split(".")[-1] if "." in col_path else col_path
+
+                if col_name not in col_stats:
+                    continue
+
+                stats = col_chunk.statistics
+                if stats is not None:
+                    col_stats[col_name]["has_statistics"] = True
+                    if stats.null_count is not None:
+                        col_stats[col_name]["null_count"] += stats.null_count
+                    if stats.num_values is not None:
+                        col_stats[col_name]["num_values"] += stats.num_values
+
+        # Build result dict
+        result: Dict[str, Dict[str, Any]] = {}
+
+        for col_name, raw_type in schema:
+            stats = col_stats.get(col_name, {})
+
+            null_count = stats.get("null_count", 0)
+            num_values = stats.get("num_values", 0)
+            has_stats = stats.get("has_statistics", False)
+
+            # Estimate distinct_count:
+            # - If no stats: use non-null count as upper bound
+            # - Parquet doesn't track distinct count
+            non_null = row_count - null_count if has_stats else row_count
+            distinct_count = non_null  # Upper bound estimate
+
+            result[col_name] = {
+                "null_count": null_count if has_stats else 0,
+                "distinct_count": distinct_count,
+                "has_statistics": has_stats,
+                "is_estimate": True,  # Flag that distinct_count is estimated
+            }
+
+        return result
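
For orientation, the sketch below shows how this backend is typically driven: build a handle for a Parquet source, connect, read schema and row count, then either take the metadata-only path or run a single-pass aggregation query before closing. It is a minimal illustration under stated assumptions, not the package's documented API: the `DatasetHandle(uri=..., format=...)` constructor call is hypothetical (the real package may build handles via a factory or connector detection), and the aggregate expressions are just examples of what a caller might pass to execute_stats_query.

# Minimal usage sketch. Assumption: DatasetHandle can be built directly from a
# URI and format string; kontra may instead provide a factory for this.
from kontra.connectors.handle import DatasetHandle
from kontra.scout.backends.duckdb_backend import DuckDBBackend

handle = DatasetHandle(uri="data/orders.parquet", format="parquet")  # hypothetical constructor
backend = DuckDBBackend(handle, sample_size=None)
backend.connect()
try:
    schema = backend.get_schema()       # [(column_name, raw_type), ...]
    rows = backend.get_row_count()      # Parquet footer when possible, else COUNT(*)

    if backend.supports_metadata_only():
        # Parquet + PyArrow + no sampling: profile from footer statistics, no scan.
        stats = backend.profile_metadata_only(schema, rows)
    else:
        # Single-pass aggregation: one SELECT carrying one expression per statistic.
        col = backend.esc_ident(schema[0][0])
        stats = backend.execute_stats_query([
            f"COUNT(*) - COUNT({col}) AS null_count",
            f"COUNT(DISTINCT {col}) AS distinct_count",
        ])
finally:
    backend.close()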