kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/__init__.py
ADDED
|
@@ -0,0 +1,1871 @@
|
|
|
1
|
+
# src/kontra/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Kontra - Developer-first Data Quality Engine
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
# CLI
|
|
7
|
+
$ kontra validate contract.yml
|
|
8
|
+
$ kontra profile data.parquet
|
|
9
|
+
|
|
10
|
+
# Python API - Simple validation
|
|
11
|
+
import kontra
|
|
12
|
+
result = kontra.validate(df, "contract.yml")
|
|
13
|
+
if result.passed:
|
|
14
|
+
print("All rules passed!")
|
|
15
|
+
|
|
16
|
+
# Python API - Inline rules
|
|
17
|
+
from kontra import rules
|
|
18
|
+
result = kontra.validate(df, rules=[
|
|
19
|
+
rules.not_null("user_id"),
|
|
20
|
+
rules.unique("email"),
|
|
21
|
+
])
|
|
22
|
+
|
|
23
|
+
# Python API - Profile data
|
|
24
|
+
profile = kontra.profile(df)
|
|
25
|
+
print(profile)
|
|
26
|
+
|
|
27
|
+
# Python API - Draft rules from profile
|
|
28
|
+
suggestions = kontra.draft(profile)
|
|
29
|
+
suggestions.save("contracts/users.yml")
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from kontra.version import VERSION as __version__
|
|
33
|
+
|
|
34
|
+
# Type imports
|
|
35
|
+
from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING
|
|
36
|
+
|
|
37
|
+
import json
|
|
38
|
+
import os
|
|
39
|
+
import polars as pl
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
import pandas as pd
|
|
43
|
+
|
|
44
|
+
# Core engine (for advanced usage)
|
|
45
|
+
from kontra.engine.engine import ValidationEngine
|
|
46
|
+
|
|
47
|
+
# Scout profiler (for advanced usage)
|
|
48
|
+
from kontra.scout.profiler import ScoutProfiler
|
|
49
|
+
|
|
50
|
+
# Scout types
|
|
51
|
+
from kontra.scout.types import DatasetProfile, ColumnProfile, ProfileDiff
|
|
52
|
+
|
|
53
|
+
# Logging
|
|
54
|
+
from kontra.logging import get_logger, log_exception
|
|
55
|
+
|
|
56
|
+
# Module-level logger, named after this module via __name__.
_logger = get_logger(__name__)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _is_pandas_dataframe(obj: Any) -> bool:
    """
    Report whether ``obj`` looks like a pandas DataFrame, without importing pandas.

    The decision is made purely from the object's class metadata: the class
    must live in a module whose dotted name starts with ``"pandas"`` and the
    class itself must be named exactly ``"DataFrame"``.
    """
    cls = type(obj)
    # Module is inspected first, then the class name (same order as a
    # short-circuiting ``and``).
    if not cls.__module__.startswith("pandas"):
        return False
    return cls.__name__ == "DataFrame"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Data file extensions that should not be passed to state functions
|
|
66
|
+
_DATA_FILE_EXTENSIONS = {".parquet", ".csv", ".json", ".ndjson", ".jsonl", ".arrow", ".feather"}


def _validate_contract_path(path: str, function_name: str) -> None:
    """
    Reject paths that look like data files where a contract file is expected.

    The check is a case-insensitive suffix match against
    ``_DATA_FILE_EXTENSIONS``; any other path is accepted without touching
    the filesystem.

    Args:
        path: The path the caller supplied as a contract.
        function_name: Name of the public function doing the check; it is
            interpolated into the error message.

    Raises:
        ValueError: If ``path`` ends with one of the known data-file extensions.
    """
    lowered = path.lower()
    matched = next(
        (suffix for suffix in _DATA_FILE_EXTENSIONS if lowered.endswith(suffix)),
        None,
    )
    if matched is None:
        return

    raise ValueError(
        f"{function_name}() requires a contract YAML file path, not a data file. "
        f"Received: '{path}' (appears to be a {matched[1:].upper()} file). "
        f"Example: kontra.{function_name}('contract.yml')"
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# API types
|
|
86
|
+
from kontra.api.results import (
|
|
87
|
+
ValidationResult,
|
|
88
|
+
RuleResult,
|
|
89
|
+
DryRunResult,
|
|
90
|
+
Diff,
|
|
91
|
+
Suggestions,
|
|
92
|
+
SuggestedRule,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Probe types
|
|
96
|
+
from kontra.api.compare import CompareResult, RelationshipProfile
|
|
97
|
+
|
|
98
|
+
# Transformation probes
|
|
99
|
+
from kontra.probes import compare, profile_relationship
|
|
100
|
+
|
|
101
|
+
# Rules helpers
|
|
102
|
+
from kontra.api.rules import rules
|
|
103
|
+
|
|
104
|
+
# Decorators
|
|
105
|
+
from kontra.api.decorators import validate as validate_decorator
|
|
106
|
+
|
|
107
|
+
# Errors
|
|
108
|
+
from kontra.errors import ValidationError, StateCorruptedError
|
|
109
|
+
|
|
110
|
+
# Configuration
|
|
111
|
+
from kontra.config.settings import (
|
|
112
|
+
resolve_datasource,
|
|
113
|
+
resolve_effective_config,
|
|
114
|
+
list_datasources,
|
|
115
|
+
KontraConfig,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# =============================================================================
|
|
120
|
+
# Core Functions
|
|
121
|
+
# =============================================================================
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def validate(
    data: Union[str, pl.DataFrame, "pd.DataFrame", List[Dict[str, Any]], Dict[str, Any], Any],
    contract: Optional[str] = None,
    *,
    table: Optional[str] = None,
    rules: Optional[List[Dict[str, Any]]] = None,
    emit_report: bool = False,
    save: bool = True,
    preplan: str = "auto",
    pushdown: str = "auto",
    projection: bool = True,
    csv_mode: str = "auto",
    env: Optional[str] = None,
    stats: str = "none",
    dry_run: bool = False,
    sample: int = 0,
    sample_budget: int = 50,
    sample_columns: Optional[Union[List[str], str]] = None,
    storage_options: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Union[ValidationResult, DryRunResult]:
    """
    Validate data against a contract and/or inline rules.

    At least one of ``contract`` or ``rules`` must be given. With
    ``dry_run=True`` the function only loads/builds the rules and returns a
    DryRunResult; it returns before ``data`` is inspected at all.

    Args:
        data: Data to validate. Accepts:
            - str: File path, URI, or named datasource (e.g., "data.parquet", "s3://...", "prod_db.users")
            - DataFrame: Polars or pandas DataFrame
            - list[dict]: Flat tabular JSON (e.g., API response data)
            - dict: Single record (converted to 1-row DataFrame)
            - Database connection: psycopg2/pyodbc/SQLAlchemy connection (requires `table` param)
        contract: Path to contract YAML file (optional if rules provided)
        table: Table name for BYOC (Bring Your Own Connection) pattern.
            Required when `data` is a database connection object, and
            rejected for every other kind of `data`.
            Formats: "table", "schema.table", or "database.schema.table"
        rules: List of inline rules (optional if contract provided). The
            dry-run path accepts rule dicts (keys read: "name", "id",
            "params", "severity") as well as already-built BaseRule instances.
        emit_report: Print validation report to console
        save: Save result to history (default: True)
        preplan: "on" | "off" | "auto"
        pushdown: "on" | "off" | "auto"
        projection: Enable column pruning
        csv_mode: "auto" | "duckdb" | "parquet"
        env: Environment name from config. Config values for preplan and
            pushdown replace an "auto" argument only when `env` is given.
        stats: "none" | "summary" | "profile"
        dry_run: If True, validate contract/rules syntax without executing
            against data. Returns DryRunResult with .valid, .rules_count,
            .columns_needed. Use to check contracts before running.
        sample: Per-rule sample cap for failing rows (default: 0 disabled, set to 5 to enable)
        sample_budget: Global sample cap across all rules (default: 50)
        sample_columns: Columns to include in samples for token efficiency.
            - None (default): All columns
            - ["col1", "col2"]: Only specified columns
            - "relevant": Rule's columns + _row_index only
        storage_options: Cloud storage credentials (S3, Azure, GCS).
            For S3/MinIO:
            - aws_access_key_id, aws_secret_access_key
            - aws_region (required for Polars)
            - endpoint_url (for MinIO/S3-compatible)
            For Azure:
            - account_name, account_key, sas_token, etc.
            These override environment variables when provided.
        **kwargs: Additional arguments passed to ValidationEngine

    Returns:
        ValidationResult with .passed, .rules, .to_llm(), etc.
        DryRunResult if dry_run=True, with .valid, .rules_count, .columns_needed

    Raises:
        ValueError: If neither `contract` nor `rules` is given, if a database
            connection is passed without `table`, or if `table` is passed
            with any non-connection `data`.
        InvalidDataError: If `data` is None, is a cursor object, is of an
            unsupported type, or the engine raises an OSError whose message
            mentions an unsupported format.
        InvalidPathError: If `data` is a string naming a directory.

    Example:
        # With contract file
        result = kontra.validate(df, "contract.yml")

        # With inline rules
        from kontra import rules
        result = kontra.validate(df, rules=[
            rules.not_null("user_id"),
            rules.unique("email"),
        ])

        # With list of dicts (e.g., API response)
        data = [{"id": 1, "email": "a@b.com"}, {"id": 2, "email": "c@d.com"}]
        result = kontra.validate(data, rules=[rules.not_null("email")])

        # With single dict (single record validation)
        record = {"id": 1, "email": "test@example.com"}
        result = kontra.validate(record, rules=[rules.regex("email", r".*@.*")])

        # BYOC (Bring Your Own Connection) - database connection + table
        import psycopg2
        conn = psycopg2.connect(host="localhost", dbname="mydb")
        result = kontra.validate(conn, table="public.users", rules=[
            rules.not_null("user_id"),
        ])
        # Note: Kontra does NOT close your connection. You manage its lifecycle.

        # Mix contract and inline rules
        result = kontra.validate(df, "base.yml", rules=[
            rules.freshness("updated_at", max_age="24h"),
        ])

        # Check result
        if result.passed:
            print("All rules passed!")
        else:
            for r in result.blocking_failures:
                print(f"FAILED: {r.rule_id}")

        # Dry run - validate contract syntax without running
        check = kontra.validate(df, "contract.yml", dry_run=True)
        if check.valid:
            print(f"Contract OK: {check.rules_count} rules, needs columns: {check.columns_needed}")
        else:
            print(f"Contract errors: {check.errors}")
    """
    from kontra.errors import InvalidDataError, InvalidPathError
    from kontra.connectors.detection import is_database_connection, is_cursor_object

    # ==========================================================================
    # Argument validation - there must be something to validate against
    # ==========================================================================
    if contract is None and rules is None:
        raise ValueError("Either contract or rules must be provided")

    # ==========================================================================
    # Dry run - validate contract/rules syntax without executing
    # Data can be None for dry_run since we're not actually validating
    # ==========================================================================
    if dry_run:
        from kontra.config.loader import ContractLoader
        from kontra.rules.factory import RuleFactory
        from kontra.rules.execution_plan import RuleExecutionPlan

        # Problems are collected into `errors` instead of being raised, so the
        # caller gets every issue back in a single DryRunResult.
        errors: List[str] = []
        contract_name: Optional[str] = None
        datasource: Optional[str] = None
        all_rule_specs: List[Any] = []

        # Load contract if provided
        if contract is not None:
            try:
                contract_obj = ContractLoader.from_path(contract)
                contract_name = contract_obj.name
                datasource = contract_obj.datasource
                all_rule_specs.extend(contract_obj.rules)
            except FileNotFoundError as e:
                errors.append(f"Contract not found: {e}")
            except ValueError as e:
                errors.append(f"Contract parse error: {e}")
            except Exception as e:
                # Catch-all so any other loader failure is still reported
                # through the result rather than propagating.
                errors.append(f"Contract error: {e}")

        # Add inline rules if provided
        inline_built_rules = []  # Already-built BaseRule instances
        if rules is not None:
            # Convert inline rules to RuleSpec format (or pass through BaseRule instances)
            from kontra.config.models import RuleSpec
            from kontra.rules.base import BaseRule as BaseRuleType
            for i, r in enumerate(rules):
                try:
                    if isinstance(r, BaseRuleType):
                        # Already a rule instance - use directly
                        inline_built_rules.append(r)
                    elif isinstance(r, dict):
                        # Missing keys fall back to: empty name, no id,
                        # empty params, "blocking" severity.
                        spec = RuleSpec(
                            name=r.get("name", ""),
                            id=r.get("id"),
                            params=r.get("params", {}),
                            severity=r.get("severity", "blocking"),
                        )
                        all_rule_specs.append(spec)
                    else:
                        errors.append(
                            f"Inline rule {i}: expected dict or BaseRule, "
                            f"got {type(r).__name__}"
                        )
                except Exception as e:
                    errors.append(f"Inline rule {i} error: {e}")

        # Try to build rules and extract required columns
        columns_needed: List[str] = []
        rules_count = 0

        # Building is skipped as soon as any earlier step recorded an error;
        # rules_count and columns_needed then keep their empty defaults.
        if not errors and (all_rule_specs or inline_built_rules):
            try:
                built_rules = RuleFactory(all_rule_specs).build_rules() if all_rule_specs else []
                # Merge with already-built rule instances
                built_rules = list(built_rules) + inline_built_rules
                rules_count = len(built_rules)

                # Extract required columns
                plan = RuleExecutionPlan(built_rules)
                compiled = plan.compile()
                # required_cols may be falsy (e.g. None); normalize to a list.
                columns_needed = list(compiled.required_cols or [])
            except Exception as e:
                errors.append(f"Rule build error: {e}")

        return DryRunResult(
            valid=len(errors) == 0,
            rules_count=rules_count,
            columns_needed=columns_needed,
            contract_name=contract_name,
            datasource=datasource,
            errors=errors,
        )

    # ==========================================================================
    # Input validation for actual validation (not dry_run)
    # ==========================================================================

    # Check for None
    if data is None:
        raise InvalidDataError("NoneType", detail="Data cannot be None")

    # Check for cursor instead of connection (common mistake)
    if is_cursor_object(data):
        raise InvalidDataError(
            type(data).__name__,
            detail="Expected database connection, got cursor object. Pass the connection, not the cursor."
        )

    # Check for BYOC pattern: connection object + table.
    # `table` is mandatory with a connection and forbidden without one.
    is_byoc = False
    if is_database_connection(data):
        if table is None:
            raise ValueError(
                "When passing a database connection, the 'table' parameter is required.\n"
                "Example: kontra.validate(conn, table='public.users', rules=[...])"
            )
        is_byoc = True
    elif table is not None:
        raise ValueError(
            "The 'table' parameter is only valid when 'data' is a database connection.\n"
            "For other data types, use file paths, URIs, or named datasources."
        )

    # Resolve config (always, for severity_weights and other settings)
    cfg = resolve_effective_config(env_name=env)

    # Apply config defaults (explicit arguments take precedence).
    # Only done when an environment was requested: without `env`, an "auto"
    # argument is passed to the engine unchanged.
    if env:
        if preplan == "auto" and cfg.preplan:
            preplan = cfg.preplan
        if pushdown == "auto" and cfg.pushdown:
            pushdown = cfg.pushdown

    # Build engine kwargs (public parameter names are mapped onto the
    # ValidationEngine keyword names; caller-supplied **kwargs come last)
    engine_kwargs = {
        "contract_path": contract,
        "emit_report": emit_report,
        "save_state": save,
        "preplan": preplan,
        "pushdown": pushdown,
        "enable_projection": projection,
        "csv_mode": csv_mode,
        "stats_mode": stats,
        "inline_rules": rules,
        "storage_options": storage_options,
        **kwargs,
    }

    # Normalize and create engine. The branches are tried in this order:
    # BYOC connection, str, list, dict, Polars DataFrame, pandas DataFrame.
    if is_byoc:
        # BYOC: database connection + table
        from kontra.connectors.handle import DatasetHandle

        handle = DatasetHandle.from_connection(data, table)
        engine = ValidationEngine(handle=handle, **engine_kwargs)
    elif isinstance(data, str):
        # File path/URI or datasource name
        # Validate: check if it's a directory (common mistake)
        if os.path.isdir(data):
            raise InvalidPathError(data, "Path is a directory, not a file")
        engine = ValidationEngine(data_path=data, **engine_kwargs)
    elif isinstance(data, list):
        # list[dict] - flat tabular JSON (e.g., API response)
        if not data:
            # Empty list - create empty DataFrame (valid for dataset-level rules like min_rows)
            df = pl.DataFrame()
        else:
            df = pl.DataFrame(data)
        engine = ValidationEngine(dataframe=df, **engine_kwargs)
    elif isinstance(data, dict) and not isinstance(data, pl.DataFrame):
        # Single dict - convert to 1-row DataFrame
        # Note: the extra pl.DataFrame exclusion is defensive; a Polars
        # DataFrame is handled by the next branch.
        if not data:
            # Empty dict - create empty DataFrame
            df = pl.DataFrame()
        else:
            df = pl.DataFrame([data])
        engine = ValidationEngine(dataframe=df, **engine_kwargs)
    elif isinstance(data, pl.DataFrame):
        # Polars DataFrame
        engine = ValidationEngine(dataframe=data, **engine_kwargs)
    elif _is_pandas_dataframe(data):
        # pandas DataFrame - passed through as-is; conversion is left to
        # ValidationEngine (not visible from this function)
        engine = ValidationEngine(dataframe=data, **engine_kwargs)
    else:
        # Invalid data type
        raise InvalidDataError(type(data).__name__)

    # Run validation
    try:
        raw_result = engine.run()
    except OSError as e:
        # Catch internal errors about unsupported formats and wrap in user-friendly error.
        # Matching is done on the message text; any other OSError is re-raised as-is.
        error_str = str(e)
        if "Unsupported format" in error_str or "PolarsConnectorMaterializer" in error_str:
            # Report the offending input; `from None` hides the internal traceback
            if isinstance(data, str):
                raise InvalidDataError(
                    "str",
                    detail=f"'{data}' is not a valid file path, URI, or datasource name"
                ) from None
            else:
                raise InvalidDataError(type(data).__name__) from None
        raise

    # Determine data source for sample_failures()
    # Priority: DataFrame > handle > data path
    if isinstance(data, pl.DataFrame):
        data_source = data
    elif is_byoc:
        # Store the handle for BYOC
        data_source = engine._handle
    elif isinstance(data, str):
        data_source = data
    else:
        # Remaining inputs (list[dict], dict, pandas DataFrame) - use the
        # engine's DataFrame
        data_source = engine.df

    # Determine loaded data to expose via result.data
    # Priority: engine.df (loaded for Polars) > input DataFrame
    if engine.df is not None:
        loaded_data = engine.df
    elif isinstance(data, pl.DataFrame):
        loaded_data = data  # User passed DataFrame directly
    else:
        loaded_data = None  # Preplan/pushdown handled everything, no data loaded

    # Wrap in ValidationResult with data source and rules for sample_failures()
    return ValidationResult.from_engine_result(
        raw_result,
        data_source=data_source,
        rule_objects=engine._rules,
        sample=sample,
        sample_budget=sample_budget,
        sample_columns=sample_columns,
        severity_weights=cfg.severity_weights,
        data=loaded_data,
    )
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def profile(
    data: Union[str, pl.DataFrame, List[Dict[str, Any]], Dict[str, Any]],
    preset: str = "scan",
    *,
    columns: Optional[List[str]] = None,
    sample: Optional[int] = None,
    save: bool = True,
    storage_options: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> DatasetProfile:
    """
    Profile a dataset.

    In-memory input (list[dict], dict, Polars DataFrame) is written to a
    temporary Parquet file that ScoutProfiler reads; the file is removed
    afterwards, including when writing or profiling fails. String input is
    first tried as a named datasource and otherwise used as a path/URI.

    Args:
        data: DataFrame (Polars), list[dict], dict, or path/URI to data file
        preset: Profiling depth:
            - "scout": Quick recon (metadata only)
            - "scan": Systematic pass (full stats) [default]
            - "interrogate": Deep investigation (everything + percentiles)
            Deprecated preset names emit a DeprecationWarning.
        columns: Only profile these columns
        sample: Sample N rows (default: all)
        save: Save profile to history
            NOTE(review): this argument is not read anywhere in this
            function - confirm whether it should be forwarded to ScoutProfiler.
        storage_options: Cloud storage credentials (S3, Azure, GCS).
            For S3/MinIO: aws_access_key_id, aws_secret_access_key, aws_region, endpoint_url
            For Azure: account_name, account_key, sas_token, etc.
            These override environment variables when provided.
            Only forwarded for path/URI input; in-memory data is profiled
            from a local temporary file.
        **kwargs: Additional arguments passed to ScoutProfiler

    Returns:
        DatasetProfile with column statistics. A DataFrame with no columns
        yields a profile with ``column_count=0`` and no column entries,
        without invoking ScoutProfiler.

    Example:
        profile = kontra.profile("data.parquet")
        print(f"Rows: {profile.row_count}")
        for col in profile.columns:
            print(f"{col.name}: {col.dtype}")

        # Quick metadata-only profile
        profile = kontra.profile("big_data.parquet", preset="scout")

        # Deep profile with percentiles
        profile = kontra.profile("data.parquet", preset="interrogate")
    """
    import warnings
    from kontra.scout.profiler import _DEPRECATED_PRESETS

    # Warn on deprecated preset names (stacklevel=2 points at the caller)
    if preset in _DEPRECATED_PRESETS:
        new_name = _DEPRECATED_PRESETS[preset]
        warnings.warn(
            f"Preset '{preset}' is deprecated, use '{new_name}' instead",
            DeprecationWarning,
            stacklevel=2,
        )

    # Convert list/dict to DataFrame; empty containers become an empty frame
    if isinstance(data, list):
        data = pl.DataFrame(data) if data else pl.DataFrame()
    elif isinstance(data, dict):
        data = pl.DataFrame([data]) if data else pl.DataFrame()

    if isinstance(data, pl.DataFrame):
        # Handle empty DataFrame (no columns) - DuckDB can't read parquet with no columns
        if data.width == 0:
            from datetime import datetime, timezone
            from kontra.version import VERSION
            return DatasetProfile(
                source_uri="<inline DataFrame>",
                source_format="dataframe",
                profiled_at=datetime.now(timezone.utc).isoformat(),
                engine_version=VERSION,
                row_count=data.height,
                column_count=0,
                columns=[],
            )

        # For DataFrame input, write to temp file
        import tempfile

        # delete=False: the file must outlive the `with` so it can be
        # reopened by path; cleanup is therefore done manually below.
        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f:
            temp_path = f.name

        try:
            # The write happens inside the try so a failed write cannot
            # leave the temporary file behind.
            data.write_parquet(temp_path)
            profiler = ScoutProfiler(
                temp_path,
                preset=preset,
                columns=columns,
                sample_size=sample,
                **kwargs,
            )
            return profiler.profile()
        finally:
            os.unlink(temp_path)
    else:
        # Resolve named datasources (e.g., "prod_db.users" -> actual URI)
        resolved_data = data
        if isinstance(data, str):
            try:
                resolved_data = resolve_datasource(data)
            except ValueError:
                # Not a named datasource - use as-is (file path or URI)
                pass

        profiler = ScoutProfiler(
            resolved_data,
            preset=preset,
            columns=columns,
            sample_size=sample,
            storage_options=storage_options,
            **kwargs,
        )
        return profiler.profile()
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def draft(
    profile: DatasetProfile,
    min_confidence: float = 0.5,
) -> Suggestions:
    """
    Turn a dataset profile into a set of suggested validation rules.

    The suggestions are derived from the profile by
    ``Suggestions.from_profile`` and are meant as a first draft to be
    reviewed and tightened with domain knowledge.

    Args:
        profile: DatasetProfile from kontra.profile()
        min_confidence: Minimum confidence score (0.0-1.0), forwarded
            unchanged to ``Suggestions.from_profile``

    Returns:
        Suggestions with .to_yaml(), .save(), .filter()

    Example:
        profile = kontra.profile(df, preset="interrogate")
        suggestions = kontra.draft(profile)

        # Filter high confidence
        high_conf = suggestions.filter(min_confidence=0.9)

        # Save to file
        high_conf.save("contracts/users.yml")

        # Or use directly
        result = kontra.validate(df, rules=suggestions.to_dict())
    """
    suggestions = Suggestions.from_profile(profile, min_confidence=min_confidence)
    return suggestions
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _parse_history_since(since: Optional[str]):
    """
    Convert a ``since`` filter string into a timezone-aware datetime.

    Accepted forms (surrounding whitespace is ignored):
        - "<int>h" / "<int>d": that many hours/days before the current UTC time
        - an ISO date or datetime (e.g. "2026-01-15"); naive values are
          treated as UTC

    Returns:
        A timezone-aware ``datetime``, or None when ``since`` is None or empty.

    Raises:
        ValueError: If the string matches none of the accepted forms. The
            same message is used for every malformed input.
    """
    from datetime import datetime, timedelta, timezone

    if not since:
        return None

    text = since.strip()
    lowered = text.lower()
    invalid = ValueError(f"Invalid since format: {since}. Use '24h', '7d', or 'YYYY-MM-DD'")

    unit = {"h": "hours", "d": "days"}.get(lowered[-1:])
    if unit is not None:
        try:
            amount = int(lowered[:-1])
            return datetime.now(timezone.utc) - timedelta(**{unit: amount})
        except (ValueError, OverflowError):
            # Non-integer or out-of-range amount, e.g. "march", "1.5d", "h"
            raise invalid from None

    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        raise invalid from None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def get_history(
    contract: str,
    *,
    limit: int = 20,
    since: Optional[str] = None,
    failed_only: bool = False,
) -> List[Dict[str, Any]]:
    """
    Get validation history for a contract.

    The contract file is loaded and fingerprinted, and the fingerprint is
    used to look up run summaries in the default state store.

    Args:
        contract: Path to contract YAML file
        limit: Maximum number of runs to return (default: 20)
        since: Only return runs after this date/time. Formats:
            - "24h", "7d" - relative time
            - "2026-01-15" - specific date
        failed_only: Only return failed runs

    Returns:
        List of run summaries, newest first. Each summary contains:
            - run_id: Unique identifier
            - timestamp: When the run occurred (ISO format)
            - passed: Overall pass/fail
            - failed_count: Total failures
            - total_rows: Row count (if available)
            - contract_name: Name of the contract
        An empty list is returned when no default store is available.

    Raises:
        ValueError: If ``contract`` looks like a data file, or ``since`` is
            not in one of the accepted formats.

    Example:
        history = kontra.get_history("contract.yml")
        for run in history:
            print(f"{run['timestamp']}: {'PASS' if run['passed'] else 'FAIL'}")

        # Last 7 days only
        recent = kontra.get_history("contract.yml", since="7d")

        # Only failed runs
        failures = kontra.get_history("contract.yml", failed_only=True)
    """
    from kontra.config.loader import ContractLoader
    from kontra.state.fingerprint import fingerprint_contract
    from kontra.state.backends import get_default_store

    # Validate that contract is a YAML file, not a data file (BUG-014)
    _validate_contract_path(contract, "get_history")

    # Load contract to get fingerprint
    contract_obj = ContractLoader.from_path(contract)
    fp = fingerprint_contract(contract_obj)

    # Parse since parameter (None means "no lower bound")
    since_dt = _parse_history_since(since)

    # Get history from store
    store = get_default_store()
    if store is None:
        return []

    summaries = store.get_run_summaries(
        contract_fingerprint=fp,
        limit=limit,
        since=since_dt,
        failed_only=failed_only,
    )

    return [s.to_dict() for s in summaries]
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
# =============================================================================
|
|
720
|
+
# Deprecated Aliases (for backward compatibility)
|
|
721
|
+
# =============================================================================
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def scout(
    data: Union[str, pl.DataFrame],
    preset: str = "standard",
    *,
    columns: Optional[List[str]] = None,
    sample: Optional[int] = None,
    save: bool = True,
    **kwargs,
) -> DatasetProfile:
    """
    DEPRECATED alias for kontra.profile().

    Emits a DeprecationWarning and then forwards every argument, including
    any extra keyword arguments, to profile() unchanged.
    """
    import warnings

    deprecation_message = "kontra.scout() is deprecated, use kontra.profile() instead"
    # stacklevel=2 attributes the warning to the caller of scout().
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    forwarded = dict(preset=preset, columns=columns, sample=sample, save=save)
    return profile(data, **forwarded, **kwargs)
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def suggest_rules(
    data: Union[str, DatasetProfile, pl.DataFrame],
    min_confidence: float = 0.5,
) -> Suggestions:
    """
    DEPRECATED: call kontra.profile() followed by kontra.draft() instead.

    Produce validation rule suggestions from raw data or an existing profile.

    Args:
        data: File path, DataFrame, or an already computed DatasetProfile.
            Paths and DataFrames are profiled with the "scan" preset first.
        min_confidence: Minimum confidence score (0.0-1.0)

    Returns:
        Suggestions with .to_yaml(), .save(), .filter()

    Raises:
        TypeError: If data is not a str, DataFrame, or DatasetProfile.
    """
    import warnings

    warnings.warn(
        "kontra.suggest_rules() is deprecated, use kontra.profile() then kontra.draft() instead",
        DeprecationWarning,
        stacklevel=2,
    )

    # An existing profile is used as-is.
    if isinstance(data, DatasetProfile):
        return draft(data, min_confidence=min_confidence)

    if not isinstance(data, (str, pl.DataFrame)):
        raise TypeError(
            f"suggest_rules() expects str, DataFrame, or DatasetProfile, got {type(data).__name__}"
        )

    # Paths and DataFrames need to be profiled before drafting.
    return draft(profile(data, preset="scan"), min_confidence=min_confidence)
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def explain(
    data: Union[str, pl.DataFrame],
    contract: str,
    **kwargs,
) -> Dict[str, Any]:
    """
    Show a summary of the execution plan without running validation.

    The contract's rules are built and compiled, and a summary of the
    compiled plan is returned. No SQL preview is produced yet (see the TODO
    in the body), and ``data`` and ``**kwargs`` are not consulted by the
    current implementation.

    Args:
        data: DataFrame or path/URI to data file (currently unused)
        contract: Path to contract YAML file
        **kwargs: Currently unused

    Returns:
        Dict with:
        - required_columns: list of columns the compiled plan needs
        - total_rules: number of rules built from the contract
        - predicates: number of compiled predicates
        - fallback_rules: number of fallback rules
        - sql_rules: list of {"rule_id": ..., "name": ...} dicts

    Example:
        plan = kontra.explain(df, "contract.yml")
        print(f"Columns needed: {plan['required_columns']}")
        for rule in plan['sql_rules']:
            print(f"{rule['rule_id']}: {rule['name']}")
    """
    # For now, return basic plan info
    # TODO: Implement full explain with SQL preview
    from kontra.config.loader import ContractLoader
    from kontra.rules.factory import RuleFactory
    from kontra.rules.execution_plan import RuleExecutionPlan

    contract_obj = ContractLoader.from_path(contract)
    rules = RuleFactory(contract_obj.rules).build_rules()
    plan = RuleExecutionPlan(rules)
    compiled = plan.compile()

    # sql_rules may be Rule objects or dicts depending on compilation;
    # entries of any other shape are skipped.
    sql_rules_info = []
    for r in compiled.sql_rules:
        if hasattr(r, "rule_id"):
            sql_rules_info.append({"rule_id": r.rule_id, "name": r.name})
        elif isinstance(r, dict):
            sql_rules_info.append({"rule_id": r.get("rule_id", ""), "name": r.get("name", "")})

    return {
        "required_columns": list(compiled.required_cols or []),
        "total_rules": len(rules),
        "predicates": len(compiled.predicates),
        "fallback_rules": len(compiled.fallback_rules),
        "sql_rules": sql_rules_info,
    }
|
|
828
|
+
|
|
829
|
+
|
|
830
|
+
def diff(
    contract: str,
    *,
    since: Optional[str] = None,
    before: Optional[str] = None,
    after: Optional[str] = None,
) -> Optional[Diff]:
    """
    Compare validation runs over time.

    With no selectors the latest run is compared against the run immediately
    before it. Only the 100 most recent runs are considered.

    Args:
        contract: Contract name or path
        since: Compare to run from this time ago ("7d", "24h", "2024-01-15").
            The newest run at or before that moment (and older than the
            "after" run) is used as the before state. Ignored when
            ``before`` is given.
        before: Specific run ID for before state. Matched against the run's
            ISO timestamp or its stringified state ID.
        after: Specific run ID for after state (default: latest)

    Returns:
        Diff with .has_changes, .regressed, .new_failures, .to_llm()
        Returns None if no history available, or if a requested run cannot
        be found.

    Raises:
        ValueError: If ``since`` is neither a relative duration nor an ISO date.
        StateCorruptedError: If stored state cannot be read or compared.

    Example:
        diff = kontra.diff("users_contract", since="7d")
        if diff and diff.regressed:
            print("Quality regressed!")
            for failure in diff.new_failures:
                print(f"  NEW: {failure['rule_id']}")
    """
    from kontra.state.backends import get_default_store
    from kontra.state.types import StateDiff
    from kontra.state.fingerprint import fingerprint_contract
    from kontra.config.loader import ContractLoader
    from kontra.errors import StateCorruptedError

    store = get_default_store()
    if store is None:
        return None

    # Validate that contract is a YAML file, not a data file (BUG-014)
    is_contract_file = os.path.isfile(contract)
    if is_contract_file:
        _validate_contract_path(contract, "diff")

    # Parsed outside the try block so a bad value surfaces as ValueError
    # rather than being reported as corrupted state.
    cutoff = _diff_since_cutoff(since) if since else None

    # Resolve contract to fingerprint
    try:
        if is_contract_file:
            # It's a file path: load contract and compute semantic fingerprint
            contract_obj = ContractLoader.from_path(contract)
            contract_fp = fingerprint_contract(contract_obj)
        else:
            # Assume it's a contract name - search stored states
            contract_fp = None
            for fp in store.list_contracts():
                history = store.get_history(fp, limit=1)
                if history and history[0].contract_name == contract:
                    contract_fp = fp
                    break

        if contract_fp is None:
            return None

        # Get history for this contract (newest first)
        states = store.get_history(contract_fp, limit=100)
        if len(states) < 2:
            return None

        # Pick the "after" run: an explicit ID, otherwise the latest.
        after_index = 0 if after is None else _diff_find_run(states, after)
        if after_index is None:
            return None
        after_state = states[after_index]

        # Pick the "before" run: an explicit ID wins over ``since``; with
        # neither, use the run immediately preceding the "after" run.
        older_states = states[after_index + 1:]
        if before is not None:
            before_index = _diff_find_run(states, before)
            before_state = None if before_index is None else states[before_index]
        elif cutoff is not None:
            before_state = next(
                (s for s in older_states if _diff_as_utc(s.run_at) <= cutoff),
                None,
            )
        else:
            before_state = older_states[0] if older_states else None

        if before_state is None:
            return None

        # Compute diff
        state_diff = StateDiff.compute(before_state, after_state)
        return Diff.from_state_diff(state_diff)

    except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
        # These indicate corrupted state data
        raise StateCorruptedError(contract, str(e)) from e
    except FileNotFoundError:
        # No history available - this is normal
        return None
    except Exception as e:
        # For other exceptions, log and re-raise as state corruption
        # since we've already handled the "no history" case
        log_exception(_logger, "Failed to compute diff", e)
        raise StateCorruptedError(contract, str(e)) from e


def _diff_since_cutoff(since: str):
    """Translate a ``since`` value ("24h", "7d" or an ISO date) into an aware UTC datetime."""
    from datetime import datetime, timedelta, timezone

    text = since.strip()
    lowered = text.lower()
    try:
        if lowered.endswith("h"):
            return datetime.now(timezone.utc) - timedelta(hours=int(lowered[:-1]))
        if lowered.endswith("d"):
            return datetime.now(timezone.utc) - timedelta(days=int(lowered[:-1]))
        parsed = datetime.fromisoformat(text)
    except (ValueError, OverflowError) as err:
        raise ValueError(
            f"Invalid since format: {since}. Use '24h', '7d', or 'YYYY-MM-DD'"
        ) from err
    return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)


def _diff_find_run(states: Any, run_id: str) -> Optional[int]:
    """Return the index in *states* of the run whose ISO timestamp or state ID equals *run_id*."""
    for index, s in enumerate(states):
        if s.run_at.isoformat() == run_id or (s.id is not None and str(s.id) == run_id):
            return index
    return None


def _diff_as_utc(moment: Any) -> Any:
    """Return *moment* as an aware datetime so it can be compared with the cutoff."""
    from datetime import timezone

    # NOTE(review): assumes naive run_at values are UTC - confirm against the
    # state backends.
    return moment if moment.tzinfo is not None else moment.replace(tzinfo=timezone.utc)
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def profile_diff(
    source: str,
    *,
    since: Optional[str] = None,
) -> Optional[ProfileDiff]:
    """
    Compare profile runs over time.

    Profile history lookup is not implemented yet, so this currently
    returns None for every input.

    Args:
        source: Data source path or name
        since: Compare to profile from this time ago

    Returns:
        Intended: ProfileDiff with .has_changes, .schema_changes, .to_llm()
        Returns None if no history available (currently always)

    Example:
        diff = kontra.profile_diff("data.parquet", since="7d")
        if diff and diff.has_schema_changes:
            print("Schema changed!")
            for col in diff.columns_added:
                print(f"  NEW: {col}")
    """
    # TODO: Implement profile history lookup
    no_diff = None
    return no_diff
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def scout_diff(
    source: str,
    *,
    since: Optional[str] = None,
) -> Optional[ProfileDiff]:
    """
    DEPRECATED alias for kontra.profile_diff().

    Emits a DeprecationWarning, then forwards ``source`` and ``since`` to
    profile_diff() and returns its result.
    """
    import warnings

    deprecation_message = "kontra.scout_diff() is deprecated, use kontra.profile_diff() instead"
    # stacklevel=2 attributes the warning to the caller of scout_diff().
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    return profile_diff(source, since=since)
|
|
960
|
+
|
|
961
|
+
|
|
962
|
+
# =============================================================================
|
|
963
|
+
# History Functions
|
|
964
|
+
# =============================================================================
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
def _resolve_contract_fingerprint(contract: str, store: Any, caller: str = "state function") -> Optional[str]:
    """
    Map a contract name or file path to its fingerprint.

    An existing file is validated, loaded and fingerprinted. Anything else
    is treated as a contract name and looked up among the contracts known
    to the store by comparing it with the name on each contract's most
    recent run.

    Args:
        contract: Contract name or file path
        store: State store instance
        caller: Name of the calling function (for error messages)

    Returns:
        Contract fingerprint or None if not found
    """
    from kontra.state.fingerprint import fingerprint_contract
    from kontra.config.loader import ContractLoader

    if not os.path.isfile(contract):
        # Not a file: assume a contract name and search stored states.
        for candidate_fp in store.list_contracts():
            latest = store.get_history(candidate_fp, limit=1)
            if latest and latest[0].contract_name == contract:
                return candidate_fp
        return None

    # File path: make sure it's not a data file (BUG-014), then compute the
    # semantic fingerprint of the loaded contract.
    _validate_contract_path(contract, caller)
    return fingerprint_contract(ContractLoader.from_path(contract))
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
def list_runs(contract: str) -> List[Dict[str, Any]]:
    """
    List past validation runs for a contract.

    At most the 100 most recent runs are returned. Any failure while
    reading history is logged and produces an empty list.

    Args:
        contract: Contract name or path

    Returns:
        List of run summaries with id, fingerprint, timestamp, passed,
        total_rules, failed_count and dataset. Empty when no store, no
        matching contract, or an error occurred.
    """
    from kontra.state.backends import get_default_store

    store = get_default_store()
    if store is None:
        return []

    try:
        contract_fp = _resolve_contract_fingerprint(contract, store, "list_runs")
        if contract_fp is None:
            return []

        runs = []
        for state in store.get_history(contract_fp, limit=100):
            runs.append(
                {
                    # The run's ISO timestamp doubles as its public ID.
                    "id": state.run_at.isoformat(),
                    "fingerprint": state.contract_fingerprint,
                    "timestamp": state.run_at,
                    "passed": state.summary.passed,
                    "total_rules": state.summary.total_rules,
                    "failed_count": state.summary.failed_rules,
                    "dataset": state.dataset_uri,
                }
            )
        return runs
    except Exception as e:
        log_exception(_logger, "Failed to list runs", e)
        return []
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def get_run(
    contract: str,
    run_id: Optional[str] = None,
) -> Optional[ValidationResult]:
    """
    Get a specific validation run.

    Only the 100 most recent runs are searched. Errors while reading
    history are logged and reported as None.

    Args:
        contract: Contract name or path
        run_id: Specific run ID (default: latest). Matched against the run's
            ISO-formatted ``run_at`` timestamp or against its stringified
            state ID, the same two forms ``annotate`` accepts.

    Returns:
        ValidationResult or None if not found
    """
    from kontra.state.backends import get_default_store

    store = get_default_store()
    if store is None:
        return None

    try:
        contract_fp = _resolve_contract_fingerprint(contract, store, "get_run")
        if contract_fp is None:
            return None

        # Get history and find specific run or latest
        states = store.get_history(contract_fp, limit=100)
        if not states:
            return None

        if run_id:
            # Accept either the timestamp ID or the backend's state ID, so
            # an ID that works with annotate() also works here.
            state = next(
                (
                    s
                    for s in states
                    if s.run_at.isoformat() == run_id
                    or (s.id is not None and str(s.id) == run_id)
                ),
                None,
            )
        else:
            # Get latest (first in list, newest first)
            state = states[0]

        if state is None:
            return None

        # Convert state to ValidationResult
        return ValidationResult(
            passed=state.summary.passed,
            dataset=state.dataset_uri,
            total_rows=state.summary.row_count or 0,
            total_rules=state.summary.total_rules,
            passed_count=state.summary.passed_rules,
            failed_count=state.summary.blocking_failures,
            warning_count=state.summary.warning_failures,
            rules=[
                RuleResult(
                    rule_id=r.rule_id,
                    name=r.rule_name,
                    passed=r.passed,
                    failed_count=r.failed_count,
                    message=r.message or "",
                    severity=r.severity,
                    source=r.execution_source,
                    column=r.column,
                )
                for r in state.rules
            ],
        )
    except Exception as e:
        log_exception(_logger, "Failed to get run", e)
        return None
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
def has_runs(contract: str) -> bool:
    """
    Report whether any validation history exists for a contract.

    Errors while consulting the store are logged and treated as
    "no history".

    Args:
        contract: Contract name or path

    Returns:
        True if at least one stored run exists
    """
    from kontra.state.backends import get_default_store

    store = get_default_store()
    if store is None:
        return False

    try:
        contract_fp = _resolve_contract_fingerprint(contract, store, "has_runs")
        if contract_fp is not None:
            # One entry is enough to prove that history exists.
            return len(store.get_history(contract_fp, limit=1)) > 0
        return False
    except Exception as e:
        log_exception(_logger, "Failed to check runs", e)
        return False
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
def list_profiles(source: str) -> List[Dict[str, Any]]:
    """
    List past profile runs for a data source.

    Profile history is not implemented yet, so the result is currently
    always an empty list.

    Args:
        source: Data source path or name

    Returns:
        List of profile summaries (currently always empty)
    """
    # TODO: Implement profile history
    profiles: List[Dict[str, Any]] = []
    return profiles
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
def get_profile(
    source: str,
    run_id: Optional[str] = None,
) -> Optional[DatasetProfile]:
    """
    Get a specific profile run.

    Profile history lookup is not implemented yet, so this currently
    returns None for every input.

    Args:
        source: Data source path or name
        run_id: Specific run ID (default: latest)

    Returns:
        DatasetProfile or None if not found (currently always None)
    """
    # TODO: Implement profile history lookup
    stored_profile = None
    return stored_profile
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
# =============================================================================
|
|
1170
|
+
# Configuration Functions
|
|
1171
|
+
# =============================================================================
|
|
1172
|
+
|
|
1173
|
+
|
|
1174
|
+
def resolve(name: str) -> str:
    """
    Turn a datasource name into its URI.

    Thin public wrapper around resolve_datasource().

    Args:
        name: Datasource name (e.g., "users" or "prod_db.users")

    Returns:
        Resolved URI

    Example:
        uri = kontra.resolve("users")
        uri = kontra.resolve("prod_db.users")
    """
    resolved_uri = resolve_datasource(name)
    return resolved_uri
|
|
1189
|
+
|
|
1190
|
+
|
|
1191
|
+
def config(env: Optional[str] = None) -> KontraConfig:
    """
    Return the effective configuration.

    Thin public wrapper around resolve_effective_config().

    Args:
        env: Environment name (default: use KONTRA_ENV or defaults)

    Returns:
        KontraConfig with preplan, pushdown, etc.

    Example:
        cfg = kontra.config()
        cfg = kontra.config(env="production")
        print(cfg.preplan)  # "auto"
    """
    effective = resolve_effective_config(env_name=env)
    return effective
|
|
1207
|
+
|
|
1208
|
+
|
|
1209
|
+
# =============================================================================
|
|
1210
|
+
# Annotation Functions
|
|
1211
|
+
# =============================================================================
|
|
1212
|
+
|
|
1213
|
+
|
|
1214
|
+
def annotate(
    contract: str,
    *,
    run_id: Optional[str] = None,
    rule_id: Optional[str] = None,
    actor_type: str = "agent",
    actor_id: str,
    annotation_type: str,
    summary: str,
    payload: Optional[Dict[str, Any]] = None,
) -> int:
    """
    Attach an annotation to a validation run, or to one rule within it.

    Annotations provide "memory without authority": agents and humans can
    record context about runs (resolutions, root causes, acknowledgments)
    without affecting Kontra's validation behavior.

    Invariants:
        - Append-only: annotations are never updated or deleted
        - Uninterpreted: Kontra stores annotation_type but doesn't define vocabulary
        - Never read during validation or diff

    Args:
        contract: Contract name or path
        run_id: Run ID to annotate (default: latest run). Looked up among
            the 100 most recent runs by ISO ``run_at`` timestamp or by
            stringified state ID.
        rule_id: Optional rule ID to annotate a specific rule
        actor_type: Who is creating the annotation ("agent" | "human" | "system")
        actor_id: Identifier for the actor (e.g., "repair-agent-v2", "alice@example.com")
        annotation_type: Type of annotation (e.g., "resolution", "root_cause", "acknowledged")
        summary: Human-readable summary
        payload: Optional structured data (dict)

    Returns:
        Annotation ID (integer)

    Raises:
        ValueError: If contract or run not found, or rule_id not found in run
        RuntimeError: If no state store is available or the annotation
            cannot be saved

    Common annotation_type values (suggested, not enforced):
        - "resolution": I fixed this
        - "root_cause": This failed because...
        - "false_positive": This isn't actually a problem
        - "acknowledged": I saw this, will address later
        - "suppressed": Intentionally ignoring this
        - "note": General comment

    Example:
        # Annotate the latest run for a contract
        kontra.annotate(
            "users_contract.yml",
            actor_type="agent",
            actor_id="repair-agent-v2",
            annotation_type="resolution",
            summary="Fixed null emails by backfilling from user_profiles table",
        )

        # Annotate a specific rule, with a structured payload
        kontra.annotate(
            "users_contract.yml",
            rule_id="COL:email:not_null",
            actor_type="human",
            actor_id="alice@example.com",
            annotation_type="root_cause",
            summary="Upstream data source failed validation",
            payload={"upstream_source": "crm_export", "affected_rows": 1523},
        )
    """
    from kontra.state.backends import get_default_store
    from kontra.state.types import Annotation
    from kontra.state.fingerprint import fingerprint_contract
    from kontra.config.loader import ContractLoader

    store = get_default_store()
    if store is None:
        raise RuntimeError("State store not available")

    # Resolve contract to fingerprint
    contract_fp = _resolve_contract_fingerprint(contract, store, "annotate")
    if contract_fp is None:
        raise ValueError(f"Contract not found: {contract}")

    # Locate the run to annotate
    if run_id is not None:
        # Either the timestamp form or the backend's ID (as a string) matches.
        state = next(
            (
                candidate
                for candidate in store.get_history(contract_fp, limit=100)
                if candidate.run_at.isoformat() == run_id
                or (candidate.id is not None and str(candidate.id) == run_id)
            ),
            None,
        )
        if state is None:
            raise ValueError(f"Run not found: {run_id}")
    else:
        state = store.get_latest(contract_fp)
        if state is None:
            raise ValueError(f"No runs found for contract: {contract}")

    # When a rule is targeted, it must exist in the chosen run
    rule_result_id = None
    if rule_id is not None:
        matching_rule = next((r for r in state.rules if r.rule_id == rule_id), None)
        if matching_rule is None:
            raise ValueError(f"Rule not found in run: {rule_id}")
        rule_result_id = matching_rule.id  # May be None for file backends

    annotation = Annotation(
        run_id=state.id or 0,
        rule_result_id=rule_result_id,
        rule_id=rule_id,  # Store semantic rule ID for cross-run queries
        actor_type=actor_type,
        actor_id=actor_id,
        annotation_type=annotation_type,
        summary=summary,
        payload=payload,
    )

    # Persist - the method depends on the backend type
    try:
        supports_direct_save = hasattr(store, "save_annotation") and not isinstance(store, type)
        if supports_direct_save:
            try:
                return store.save_annotation(annotation)
            except NotImplementedError:
                # Fall through to the per-run save path below.
                pass

        if not hasattr(store, "save_annotation_for_run"):
            raise RuntimeError("Backend does not support annotations")

        # These backends address runs by a string ID, recovered from the
        # stored run files.
        run_id_str = _find_run_id_string(store, contract_fp, state)
        if run_id_str is None:
            raise RuntimeError("Could not find run file for annotation")
        return store.save_annotation_for_run(contract_fp, run_id_str, annotation)

    except Exception as e:
        raise RuntimeError(f"Failed to save annotation: {e}") from e
|
|
1383
|
+
|
|
1384
|
+
|
|
1385
|
+
def _find_run_id_string(store: Any, contract_fp: str, state: Any) -> Optional[str]:
|
|
1386
|
+
"""
|
|
1387
|
+
Find the run_id string for a state in file-based backends.
|
|
1388
|
+
|
|
1389
|
+
This is needed because file-based backends use string run IDs but
|
|
1390
|
+
ValidationState.id is an integer hash.
|
|
1391
|
+
"""
|
|
1392
|
+
from pathlib import Path
|
|
1393
|
+
|
|
1394
|
+
# LocalStore
|
|
1395
|
+
if hasattr(store, "_runs_dir"):
|
|
1396
|
+
runs_dir = store._runs_dir(contract_fp)
|
|
1397
|
+
if runs_dir.exists():
|
|
1398
|
+
for filepath in runs_dir.glob("*.json"):
|
|
1399
|
+
if filepath.name.endswith(".ann.jsonl"):
|
|
1400
|
+
continue
|
|
1401
|
+
loaded = store._load_state(filepath)
|
|
1402
|
+
if loaded and loaded.id == state.id:
|
|
1403
|
+
return filepath.stem
|
|
1404
|
+
return None
|
|
1405
|
+
|
|
1406
|
+
# S3Store - similar pattern but via fsspec
|
|
1407
|
+
if hasattr(store, "_runs_prefix") and hasattr(store, "_get_fs"):
|
|
1408
|
+
fs = store._get_fs()
|
|
1409
|
+
prefix = store._runs_prefix(contract_fp)
|
|
1410
|
+
try:
|
|
1411
|
+
all_files = fs.glob(f"s3://{prefix}/*.json")
|
|
1412
|
+
files = [f for f in all_files if not f.endswith(".ann.jsonl")]
|
|
1413
|
+
for filepath in files:
|
|
1414
|
+
loaded = store._load_state(filepath)
|
|
1415
|
+
if loaded and loaded.id == state.id:
|
|
1416
|
+
return filepath.rsplit("/", 1)[-1].replace(".json", "")
|
|
1417
|
+
except Exception:
|
|
1418
|
+
pass
|
|
1419
|
+
return None
|
|
1420
|
+
|
|
1421
|
+
return None
|
|
1422
|
+
|
|
1423
|
+
|
|
1424
|
+
def get_run_with_annotations(
    contract: str,
    run_id: Optional[str] = None,
) -> Optional[ValidationResult]:
    """
    Get a validation run with its annotations loaded.

    By default, annotations are not loaded (they're opt-in for performance).
    Use this function when you need to see annotations.

    Args:
        contract: Contract name or path
        run_id: Run ID (default: latest run). Either an integer ID given as
            a string, or the run's ISO ``run_at`` timestamp (looked up among
            the 100 most recent runs).

    Returns:
        ValidationResult with annotations, or None if not found. A
        timestamp ``run_id`` that matches no stored run yields None rather
        than the latest run. Errors are logged and reported as None.

    Example:
        result = kontra.get_run_with_annotations("users_contract.yml")
        if result:
            for rule in result.rules:
                print(f"{rule.rule_id}: {rule.annotations}")
    """
    from kontra.state.backends import get_default_store

    store = get_default_store()
    if store is None:
        return None

    try:
        contract_fp = _resolve_contract_fingerprint(contract, store, "get_run_with_annotations")
        if contract_fp is None:
            return None

        # Convert run_id string to integer if needed
        run_id_int = None
        if run_id is not None:
            try:
                run_id_int = int(run_id)
            except ValueError:
                # It's a timestamp or string ID - need to find the matching state
                states = store.get_history(contract_fp, limit=100)
                matched = next((s for s in states if s.run_at.isoformat() == run_id), None)
                if matched is None:
                    # An explicitly requested run that does not exist must
                    # not fall through to the "latest run" default below.
                    return None
                # NOTE(review): if matched.id is None the store call below
                # still receives None, the same value used for "latest run"
                # - confirm whether any backend can return states without IDs.
                run_id_int = matched.id

        state = store.get_run_with_annotations(contract_fp, run_id_int)
        if state is None:
            return None

        # Convert to ValidationResult
        return ValidationResult(
            passed=state.summary.passed,
            dataset=state.dataset_uri,
            total_rows=state.summary.row_count or 0,
            total_rules=state.summary.total_rules,
            passed_count=state.summary.passed_rules,
            failed_count=state.summary.blocking_failures,
            warning_count=state.summary.warning_failures,
            rules=[
                RuleResult(
                    rule_id=r.rule_id,
                    name=r.rule_name,
                    passed=r.passed,
                    failed_count=r.failed_count,
                    message=r.message or "",
                    severity=r.severity,
                    source=r.execution_source,
                    column=r.column,
                    annotations=[a.to_dict() for a in r.annotations] if r.annotations else None,
                )
                for r in state.rules
            ],
            annotations=[a.to_dict() for a in state.annotations] if state.annotations else None,
        )
    except Exception as e:
        log_exception(_logger, "Failed to get run with annotations", e)
        return None
|
|
1503
|
+
|
|
1504
|
+
|
|
1505
|
+
def get_annotations(
    contract: str,
    *,
    rule_id: Optional[str] = None,
    annotation_type: Optional[str] = None,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """
    Look up annotations recorded on earlier runs of a contract.

    Intended for the "have we seen this before?" workflow: an agent hits a
    failing rule and asks whether previous runs left any hints for it, which
    gives it memory across sessions.

    Args:
        contract: Contract name or path
        rule_id: Restrict results to annotations on this rule (recommended)
        annotation_type: Restrict results to one type
            (e.g., "resolution", "false_positive")
        limit: Max annotations to return (default 20)

    Returns:
        List of annotation dicts, most recent first. An empty list is
        returned when no default store is available, when the contract
        cannot be resolved to a fingerprint, or when the lookup raises
        (the error is logged, not propagated). Each dict contains:
        - id: Annotation ID
        - run_id: Which run this was attached to
        - rule_id: Semantic rule ID (e.g., "COL:email:not_null") or None for run-level
        - actor_type: "agent" | "human" | "system"
        - actor_id: Who created it
        - annotation_type: Type (e.g., "resolution", "root_cause")
        - summary: Human-readable summary
        - payload: Optional structured data
        - created_at: When it was created

    Example:
        # Agent sees COL:email:not_null failing, checks for past hints
        hints = kontra.get_annotations(
            "users_contract.yml",
            rule_id="COL:email:not_null",
        )

        for hint in hints:
            print(f"[{hint['annotation_type']}] {hint['summary']}")

        # Get only resolutions
        resolutions = kontra.get_annotations(
            "users_contract.yml",
            rule_id="COL:email:not_null",
            annotation_type="resolution",
        )
    """
    from kontra.state.backends import get_default_store

    backend = get_default_store()
    if backend is None:
        return []

    collected: List[Dict[str, Any]] = []
    try:
        fingerprint = _resolve_contract_fingerprint(contract, backend, "get_annotations")
        if fingerprint is not None:
            matches = backend.get_annotations_for_contract(
                fingerprint,
                rule_id=rule_id,
                annotation_type=annotation_type,
                limit=limit,
            )
            collected = [note.to_dict() for note in matches]
    except Exception as exc:
        # Best-effort API: log the failure and report "nothing found".
        log_exception(_logger, "Failed to get annotations", exc)
        return []

    return collected
|
|
1575
|
+
|
|
1576
|
+
|
|
1577
|
+
# =============================================================================
|
|
1578
|
+
# Service/Agent Support Functions
|
|
1579
|
+
# =============================================================================
|
|
1580
|
+
|
|
1581
|
+
# Global config path override for service/agent use.
# Written by set_config() and returned by get_config_path(); None (the
# initial value) means no explicit path has been set.
_config_path_override: Optional[str] = None
|
|
1583
|
+
|
|
1584
|
+
|
|
1585
|
+
def set_config(path: Optional[str]) -> None:
    """
    Set config file path for service/agent use.

    By default, Kontra discovers config from cwd (.kontra/config.yml).
    For long-running services or agents, use this to set an explicit path.

    Args:
        path: Path to config.yml (or None to reset to auto-discovery).
            Path-like objects such as ``pathlib.Path`` are accepted as well
            and are stored in their string form.

    Raises:
        TypeError: If path is not None and is not a string or path-like
            object (raised by ``os.fspath``).

    Example:
        kontra.set_config("/etc/kontra/config.yml")
        result = kontra.validate(df, rules=[...])

        # Reset to default behavior
        kontra.set_config(None)
    """
    import os

    global _config_path_override
    # Normalise path-like input so the stored override, and therefore the
    # value handed back by get_config_path(), honours the Optional[str]
    # contract even when a pathlib.Path is passed in.
    _config_path_override = None if path is None else os.fspath(path)
|
|
1604
|
+
|
|
1605
|
+
|
|
1606
|
+
def get_config_path() -> Optional[str]:
    """
    Report the config path override that is currently in effect.

    Returns:
        The module-level override value, or None if using auto-discovery.
    """
    current_override = _config_path_override
    return current_override
|
|
1614
|
+
|
|
1615
|
+
|
|
1616
|
+
def list_rules() -> List[Dict[str, Any]]:
    """
    Describe every validation rule present in the rule registry.

    Meant for agents and integrations that need to discover which rules
    exist and how to parameterise them.

    Returns:
        One dict per registered rule, sorted by rule name, with the keys
        "name", "description", "params" and "scope". Rules that have no
        hand-written entry in the metadata table below get a generic
        description, empty params and scope "unknown".

    Example:
        rules = kontra.list_rules()
        for rule in rules:
            print(f"{rule['name']}: {rule['description']}")
    """
    from kontra.rules.registry import RULE_REGISTRY

    # Hand-maintained descriptions, keyed by rule name. Kept by hand on
    # purpose: rule docstrings are not consistent enough to parse.
    rule_docs = {
        "not_null": {
            "description": "Fails where column contains NULL values (optionally NaN)",
            "params": {"column": "required", "include_nan": "optional (default: False)"},
            "scope": "column",
        },
        "unique": {
            "description": "Fails where column contains duplicate values",
            "params": {"column": "required"},
            "scope": "column",
        },
        "allowed_values": {
            "description": "Fails where column contains values not in allowed list",
            "params": {"column": "required", "values": "required (list)"},
            "scope": "column",
        },
        "disallowed_values": {
            "description": "Fails where column contains values that ARE in the disallowed list",
            "params": {"column": "required", "values": "required (list)"},
            "scope": "column",
        },
        "range": {
            "description": "Fails where column values are outside [min, max] range",
            "params": {"column": "required", "min": "optional", "max": "optional"},
            "scope": "column",
        },
        "length": {
            "description": "Fails where string length is outside [min, max] bounds",
            "params": {"column": "required", "min": "optional", "max": "optional"},
            "scope": "column",
        },
        "regex": {
            "description": "Fails where column values don't match regex pattern",
            "params": {"column": "required", "pattern": "required"},
            "scope": "column",
        },
        "contains": {
            "description": "Fails where column values don't contain the substring",
            "params": {"column": "required", "substring": "required"},
            "scope": "column",
        },
        "starts_with": {
            "description": "Fails where column values don't start with the prefix",
            "params": {"column": "required", "prefix": "required"},
            "scope": "column",
        },
        "ends_with": {
            "description": "Fails where column values don't end with the suffix",
            "params": {"column": "required", "suffix": "required"},
            "scope": "column",
        },
        "dtype": {
            "description": "Fails if column data type doesn't match expected type",
            "params": {"column": "required", "type": "required"},
            "scope": "column",
        },
        "min_rows": {
            "description": "Fails if dataset has fewer than threshold rows",
            "params": {"threshold": "required (int)"},
            "scope": "dataset",
        },
        "max_rows": {
            "description": "Fails if dataset has more than threshold rows",
            "params": {"threshold": "required (int)"},
            "scope": "dataset",
        },
        "freshness": {
            "description": "Fails if timestamp column is older than max_age",
            "params": {"column": "required", "max_age": "required (e.g., '24h', '7d')"},
            "scope": "column",
        },
        "custom_sql_check": {
            "description": "Escape hatch: run arbitrary SQL that returns violation count",
            "params": {"sql": "required", "threshold": "optional (default: 0)"},
            "scope": "dataset",
        },
        "compare": {
            "description": "Fails where left column doesn't satisfy comparison with right column",
            "params": {
                "left": "required (column name)",
                "right": "required (column name)",
                "op": "required (>, >=, <, <=, ==, !=)",
            },
            "scope": "cross-column",
        },
        "conditional_not_null": {
            "description": "Fails where column is NULL when a condition is met",
            "params": {
                "column": "required (column to check)",
                "when": "required (e.g., \"status == 'shipped'\")",
            },
            "scope": "cross-column",
        },
        "conditional_range": {
            "description": "Fails where column is outside range when a condition is met",
            "params": {
                "column": "required (column to check)",
                "when": "required (e.g., \"customer_type == 'premium'\")",
                "min": "optional (minimum value, inclusive)",
                "max": "optional (maximum value, inclusive)",
            },
            "scope": "cross-column",
        },
    }

    catalogue: List[Dict[str, Any]] = []
    for rule_name in sorted(RULE_REGISTRY.keys()):
        documented = rule_docs.get(rule_name)
        if documented is None:
            # Registered rule without a curated entry: emit a placeholder.
            entry: Dict[str, Any] = {
                "name": rule_name,
                "description": f"Validation rule: {rule_name}",
                "params": {},
                "scope": "unknown",
            }
        else:
            entry = {
                "name": rule_name,
                "description": documented.get("description", ""),
                "params": documented.get("params", {}),
                "scope": documented.get("scope", "unknown"),
            }
        catalogue.append(entry)

    return catalogue
|
|
1758
|
+
|
|
1759
|
+
|
|
1760
|
+
def health() -> Dict[str, Any]:
    """
    Health check for service/agent use.

    Returns version, config status, and available rules.
    Use this to verify Kontra is properly installed and configured.

    Returns:
        Dict with the keys:
        - version: value of the package's ``__version__``
        - status: "ok", or "config_not_found" when an explicit override
          path is set but does not exist
        - config_path: the override path if one is set, otherwise the
          path returned by ``find_config_file()`` as a string (None when
          nothing was found)
        - config_found: whether that config file was found
        - rule_count: number of entries in the rule registry
        - rules: sorted list of registered rule names

    Example:
        health = kontra.health()
        if health["status"] == "ok":
            print(f"Kontra {health['version']} ready")
        else:
            print(f"Issue: {health['status']}")
    """
    from kontra.rules.registry import RULE_REGISTRY
    from kontra.config.settings import find_config_file
    from pathlib import Path

    result: Dict[str, Any] = {
        "version": __version__,
        "status": "ok",
    }

    # Check config
    if _config_path_override:
        config_path = Path(_config_path_override)
        # Probe the filesystem once so that config_found and status are
        # always derived from the same observation.
        override_exists = config_path.exists()
        result["config_path"] = str(config_path)
        result["config_found"] = override_exists
        if not override_exists:
            result["status"] = "config_not_found"
    else:
        # No explicit override: a missing auto-discovered config is not
        # treated as an error, so status stays "ok".
        found = find_config_file()
        result["config_path"] = str(found) if found else None
        result["config_found"] = found is not None

    # Rule count
    result["rule_count"] = len(RULE_REGISTRY)

    # List available rules
    result["rules"] = sorted(RULE_REGISTRY.keys())

    return result
|
|
1805
|
+
|
|
1806
|
+
|
|
1807
|
+
# =============================================================================
|
|
1808
|
+
# Exports
|
|
1809
|
+
# =============================================================================
|
|
1810
|
+
|
|
1811
|
+
# Names exported by `from kontra import *`, grouped by purpose.
__all__ = [
    # Version
    "__version__",
    # Core functions
    "validate",
    "profile",
    "draft",
    "explain",
    "diff",
    "profile_diff",
    # Transformation probes
    "compare",
    "profile_relationship",
    # Deprecated aliases (kept for backward compatibility)
    "scout",  # Use profile() instead
    "suggest_rules",  # Use draft() instead
    "scout_diff",  # Use profile_diff() instead
    # History functions
    "list_runs",
    "get_run",
    "has_runs",
    "list_profiles",
    "get_profile",
    # Annotation functions
    "annotate",
    "get_annotations",
    "get_run_with_annotations",
    # Configuration functions
    "resolve",
    "config",
    "list_datasources",
    # Service/Agent support
    "set_config",
    "get_config_path",
    "list_rules",
    "health",
    # Result types
    "ValidationResult",
    "RuleResult",
    "DryRunResult",
    "Diff",
    "Suggestions",
    "SuggestedRule",
    "DatasetProfile",
    "ColumnProfile",
    "ProfileDiff",
    # Probe result types
    "CompareResult",
    "RelationshipProfile",
    # Rules helpers
    "rules",
    # Decorators
    "validate_decorator",
    # Errors
    "ValidationError",
    "StateCorruptedError",
    # Advanced usage
    "ValidationEngine",
    "ScoutProfiler",
    "KontraConfig",
]
|