kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,103 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Dict, Any, Optional
4
+
5
+ from kontra.rules.base import BaseRule
6
+ from kontra.rules.registry import get_rule, get_all_rule_names
7
+ from kontra.config.models import RuleSpec
8
+ from kontra.errors import DuplicateRuleIdError
9
+
10
+
11
+ def _derive_rule_id(spec: RuleSpec) -> str:
12
+ """
13
+ Generate a stable, unique rule_id for a rule spec when no explicit id is provided.
14
+
15
+ Policy:
16
+ - If spec.id is set → return it as-is (caller must ensure uniqueness)
17
+ - If column param exists and is a string → COL:{column}:{name}
18
+ - Otherwise → DATASET:{name}
19
+ """
20
+ explicit: Optional[str] = getattr(spec, "id", None)
21
+ if explicit:
22
+ return explicit
23
+
24
+ params: Dict[str, Any] = spec.params or {}
25
+ col = params.get("column")
26
+ if isinstance(col, str) and col:
27
+ return f"COL:{col}:{spec.name}"
28
+ return f"DATASET:{spec.name}"
29
+
30
+
31
class RuleFactory:
    """
    Build live rule objects from the RuleSpec entries of a contract.

    For each spec the factory:
      - looks up the rule class in the registry,
      - constructs it from (name, params),
      - stamps rule_id, severity and context onto the instance,
      - raises descriptive errors for unknown rules, duplicate ids and
        constructor failures.
    """

    def __init__(self, rule_specs: List[RuleSpec]):
        self.rule_specs = rule_specs

    def build_rules(self) -> List[BaseRule]:
        """
        Instantiate every rule declared in the contract, in declaration order.

        Raises:
            ValueError: the rule name is not registered (the message lists the
                registered names), or rule construction itself raised
                ValueError.
            DuplicateRuleIdError: two specs resolve to the same rule_id.
            RuntimeError: any other failure while constructing or configuring
                a rule; the original exception is chained as the cause.
        """
        built: List[BaseRule] = []
        # rule_id -> position of the spec that first claimed it (for error messages)
        first_seen_at: Dict[str, int] = {}

        for position, spec in enumerate(self.rule_specs):
            name = spec.name
            params = spec.params or {}

            try:
                rule_cls = get_rule(name)
            except KeyError:
                known = ", ".join(sorted(get_all_rule_names()))
                raise ValueError(
                    f"Unknown rule '{name}'. "
                    f"Available rules: {known}"
                )

            try:
                # Constructors take (name, params) only; identity and metadata
                # are attached to the instance afterwards.
                instance: BaseRule = rule_cls(name, params)
                rule_id = _derive_rule_id(spec)

                if rule_id in first_seen_at:
                    column = params.get("column")
                    raise DuplicateRuleIdError(
                        rule_id=rule_id,
                        rule_name=name,
                        rule_index=position,
                        conflict_index=first_seen_at[rule_id],
                        column=column if isinstance(column, str) else None,
                    )
                first_seen_at[rule_id] = position

                instance.rule_id = rule_id
                instance.severity = spec.severity
                instance.context = spec.context or {}
                built.append(instance)
            except (ValueError, DuplicateRuleIdError):
                # Validation problems propagate unchanged.
                raise
            except Exception as e:
                raise RuntimeError(f"Failed to instantiate rule '{name}': {e}") from e

        return built

    @staticmethod
    def summarize_rules(rules: List[BaseRule]) -> List[Dict[str, Any]]:
        """Describe each rule as a dict of rule_id, params and class name (for debug/reporting)."""
        summary: List[Dict[str, Any]] = []
        for rule in rules:
            summary.append(
                {
                    # Rules that never received a rule_id are reported under their name.
                    "rule_id": getattr(rule, "rule_id", rule.name),
                    "params": rule.params,
                    "class": rule.__class__.__name__,
                }
            )
        return summary
@@ -0,0 +1,25 @@
1
+ # src/kontra/rules/predicates.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Set
6
+ import polars as pl
7
+
8
@dataclass(frozen=True)
class Predicate:
    """
    Immutable description of one rule's vectorized failure mask.

    Fields
    ------
    rule_id : str
        Stable id of the rule instance this mask belongs to.
    expr : pl.Expr
        Boolean expression that is True on rows that FAIL the rule.
    message : str
        Deterministic, human-readable message reported when the rule fails.
    columns : set[str]
        Names of the columns `expr` references (used for column pruning).
    """

    rule_id: str
    expr: pl.Expr
    message: str
    columns: Set[str]
@@ -0,0 +1,24 @@
1
+ # src/kontra/rules/registry.py
2
+ from typing import Dict, Type
3
+ from kontra.rules.base import BaseRule
4
+
5
# Global lookup table: registered rule name -> rule class.
RULE_REGISTRY: Dict[str, Type[BaseRule]] = {}


def register_rule(name: str):
    """
    Class decorator that publishes a rule class under `name`.

    The decorated class is stored in RULE_REGISTRY (replacing any class that
    was registered under the same name before), receives a `rule_key`
    attribute equal to `name`, and is handed back to the caller.
    """

    def _register(rule_cls: Type[BaseRule]):
        RULE_REGISTRY[name] = rule_cls
        rule_cls.rule_key = name
        return rule_cls

    return _register
14
+
15
def get_rule(name: str) -> Type[BaseRule]:
    """Return the rule class registered under `name`; raise KeyError if there is none."""
    if name in RULE_REGISTRY:
        return RULE_REGISTRY[name]
    raise KeyError(f"Rule '{name}' not found in registry.")
20
+
21
+
22
def get_all_rule_names() -> set:
    """Return a new set holding every rule name currently in the registry."""
    names = set()
    names.update(RULE_REGISTRY)
    return names
@@ -0,0 +1,120 @@
1
+ # src/kontra/rules/static_predicates.py
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Dict, Iterable, List, Tuple
5
+
6
+ from kontra.rules.base import BaseRule
7
+
8
+ # (rule_id, column, op, value) -- op ∈ ALLOWED_OPS
9
+ PredicateT = Tuple[str, str, str, Any]
10
+ ALLOWED_OPS = {"==", "!=", ">=", ">", "<=", "<", "^=", "not_null"}
11
+
12
+
13
+ def _normalize(pairs: Iterable[PredicateT]) -> List[PredicateT]:
14
+ """Validate and normalize a stream of preplan predicates."""
15
+ out: List[PredicateT] = []
16
+ seen: set[Tuple[str, str, str, Any]] = set()
17
+ for rid, col, op, val in pairs:
18
+ if not isinstance(rid, str) or not rid:
19
+ continue
20
+ if not isinstance(col, str) or not col:
21
+ continue
22
+ if op not in ALLOWED_OPS:
23
+ continue
24
+ key = (rid, col, op, val)
25
+ if key in seen:
26
+ continue
27
+ seen.add(key)
28
+ out.append((rid, col, op, val))
29
+ return out
30
+
31
+
32
+ def _from_rule_hook(rule: BaseRule) -> List[PredicateT]:
33
+ """Ask the rule itself (if it implements the optional hook)."""
34
+ fn = getattr(rule, "to_preplan_predicates", None)
35
+ if callable(fn):
36
+ try:
37
+ preds = fn() or []
38
+ except Exception:
39
+ preds = []
40
+ # Ensure each tuple starts with this rule's rule_id
41
+ fixed: List[PredicateT] = []
42
+ for item in preds:
43
+ if not isinstance(item, tuple) or len(item) != 4:
44
+ continue
45
+ rid, col, op, val = item
46
+ # Allow rule to omit rid; fill it in
47
+ if not isinstance(rid, str) or not rid:
48
+ rid = getattr(rule, "rule_id", getattr(rule, "name", ""))
49
+ fixed.append((rid, col, op, val))
50
+ return fixed
51
+ return []
52
+
53
+
54
+ def _conservative_builtin_mapping(rule: BaseRule) -> List[PredicateT]:
55
+ """
56
+ Optional mapping for known built-ins, so you don't have to add hooks yet.
57
+ Keep this conservative and obvious (no regex engines etc.).
58
+ """
59
+ name = getattr(rule, "name", "")
60
+ params: Dict[str, Any] = getattr(rule, "params", {}) or {}
61
+ rid = getattr(rule, "rule_id", name)
62
+
63
+ out: List[PredicateT] = []
64
+
65
+ # not_null(column)
66
+ if name.endswith("not_null"):
67
+ col = params.get("column")
68
+ if isinstance(col, str) and col:
69
+ out.append((rid, col, "not_null", True))
70
+
71
+ # equals / allowed_values (single value)
72
+ if name in {"equals", "allowed_values"}:
73
+ col = params.get("column")
74
+ val = params.get("value", None)
75
+ if val is None:
76
+ vals = params.get("values")
77
+ if isinstance(vals, (list, tuple)) and len(vals) == 1:
78
+ val = vals[0]
79
+ if isinstance(col, str) and col and isinstance(val, (str, int, float)):
80
+ out.append((rid, col, "==", val))
81
+
82
+ # min / max / range style (very conservative)
83
+ if name in {"gte", "min_value", "min"}:
84
+ col = params.get("column"); v = params.get("value")
85
+ if isinstance(col, str) and col and v is not None:
86
+ out.append((rid, col, ">=", v))
87
+ if name in {"lte", "max_value", "max"}:
88
+ col = params.get("column"); v = params.get("value")
89
+ if isinstance(col, str) and col and v is not None:
90
+ out.append((rid, col, "<=", v))
91
+
92
+ # regex("^prefix") → prefix
93
+ if name == "regex":
94
+ col = params.get("column")
95
+ pat = params.get("pattern", "")
96
+ if isinstance(col, str) and col and isinstance(pat, str) and pat.startswith("^"):
97
+ # only allow pure-prefix (no special chars beyond anchors)
98
+ body = pat[1:]
99
+ if body and all(ch.isalnum() or ch in {"_", "-", ".", "@"} for ch in body):
100
+ out.append((rid, col, "^=", body))
101
+
102
+ return out
103
+
104
+
105
def extract_static_predicates_from_rules(rules: List[BaseRule]) -> List[PredicateT]:
    """
    Preferred entry point: takes the ORIGINAL rule instances (from RuleFactory).

    For every rule, predicates from its optional hook are gathered first,
    followed by those from the conservative built-in mapping; the combined
    list is then validated and de-duplicated.
    """
    sources = (_from_rule_hook, _conservative_builtin_mapping)
    gathered: List[PredicateT] = [
        predicate
        for rule in rules
        for source in sources
        for predicate in source(rule)
    ]
    return _normalize(gathered)
115
+
116
+
117
+ # Backward-compatible shim if you still want a function named extract_static_predicates
118
+ # and you have access to the original rules alongside the compiled plan.
119
def extract_static_predicates(*, rules: List[BaseRule]) -> List[PredicateT]:
    """Keyword-only alias that forwards to extract_static_predicates_from_rules."""
    predicates = extract_static_predicates_from_rules(rules)
    return predicates
@@ -0,0 +1,9 @@
1
+ # src/kontra/scout/__init__.py
2
+ """
3
+ Kontra Scout - Contract-free data profiling for LLM context compression.
4
+ """
5
+
6
+ from kontra.scout.profiler import ScoutProfiler
7
+ from kontra.scout.types import ColumnProfile, DatasetProfile
8
+
9
+ __all__ = ["ScoutProfiler", "ColumnProfile", "DatasetProfile"]
@@ -0,0 +1,17 @@
1
+ # src/kontra/scout/backends/__init__.py
2
+ """
3
+ Scout profiler backends - pluggable data source adapters.
4
+ """
5
+
6
+ from .base import ProfilerBackend
7
+ from .duckdb_backend import DuckDBBackend
8
+
9
+ __all__ = ["ProfilerBackend", "DuckDBBackend"]
10
+
11
+ # PostgreSQL backend (optional - requires psycopg)
12
+ try:
13
+ from .postgres_backend import PostgreSQLBackend
14
+
15
+ __all__.append("PostgreSQLBackend")
16
+ except ImportError:
17
+ pass
@@ -0,0 +1,111 @@
1
+ # src/kontra/scout/backends/base.py
2
+ """
3
+ ProfilerBackend protocol - abstract interface for Scout data source adapters.
4
+
5
+ Each backend implements SQL-based profiling for a specific data source type.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional, Protocol, Tuple
11
+
12
+
13
class ProfilerBackend(Protocol):
    """
    Structural interface that every Scout profiler backend satisfies.

    A backend knows how to:
      - open and close a connection to its data source,
      - report schema information,
      - run aggregation queries,
      - fetch values for low-cardinality columns.

    Implementations:
      - DuckDBBackend: Parquet, CSV (local + S3)
      - PostgreSQLBackend: PostgreSQL tables
    """

    def connect(self) -> None:
        """Open the connection to the data source."""
        ...

    def close(self) -> None:
        """Close the connection and release its resources."""
        ...

    def get_schema(self) -> List[Tuple[str, str]]:
        """
        Return the schema as a list of (column_name, raw_type) pairs.

        raw_type is the data source's own native type string.
        """
        ...

    def get_row_count(self) -> int:
        """Return the total number of rows (may be answered from metadata)."""
        ...

    def get_estimated_size_bytes(self) -> Optional[int]:
        """Return the estimated size in bytes, if it is available."""
        ...

    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
        """
        Run one aggregation query made up of several expressions.

        Args:
            exprs: SQL expressions such as "COUNT(*) AS total".

        Returns:
            Mapping from each column alias to its value.
        """
        ...

    def fetch_top_values(
        self, column: str, limit: int
    ) -> List[Tuple[Any, int]]:
        """
        Fetch the N most frequent values of a column.

        Args:
            column: Column name.
            limit: Maximum number of values to return.

        Returns:
            (value, count) tuples ordered by count, highest first.
        """
        ...

    def fetch_distinct_values(self, column: str) -> List[Any]:
        """
        Fetch every distinct value of a low-cardinality column.

        Args:
            column: Column name.

        Returns:
            The distinct values, ordered.
        """
        ...

    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
        """
        Fetch a sample of a column's values for pattern detection.

        Args:
            column: Column name.
            limit: Maximum number of values to return.

        Returns:
            The sampled values.
        """
        ...

    def esc_ident(self, name: str) -> str:
        """Escape an identifier (column/table name) for this backend's SQL dialect."""
        ...

    @property
    def source_format(self) -> str:
        """Source format identifier (e.g., 'parquet', 'csv', 'postgres')."""
        ...
+ ...