kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/rules/factory.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Dict, Any, Optional
|
|
4
|
+
|
|
5
|
+
from kontra.rules.base import BaseRule
|
|
6
|
+
from kontra.rules.registry import get_rule, get_all_rule_names
|
|
7
|
+
from kontra.config.models import RuleSpec
|
|
8
|
+
from kontra.errors import DuplicateRuleIdError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _derive_rule_id(spec: RuleSpec) -> str:
|
|
12
|
+
"""
|
|
13
|
+
Generate a stable, unique rule_id for a rule spec when no explicit id is provided.
|
|
14
|
+
|
|
15
|
+
Policy:
|
|
16
|
+
- If spec.id is set → return it as-is (caller must ensure uniqueness)
|
|
17
|
+
- If column param exists and is a string → COL:{column}:{name}
|
|
18
|
+
- Otherwise → DATASET:{name}
|
|
19
|
+
"""
|
|
20
|
+
explicit: Optional[str] = getattr(spec, "id", None)
|
|
21
|
+
if explicit:
|
|
22
|
+
return explicit
|
|
23
|
+
|
|
24
|
+
params: Dict[str, Any] = spec.params or {}
|
|
25
|
+
col = params.get("column")
|
|
26
|
+
if isinstance(col, str) and col:
|
|
27
|
+
return f"COL:{col}:{spec.name}"
|
|
28
|
+
return f"DATASET:{spec.name}"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RuleFactory:
    """
    Turn the RuleSpec entries of a contract into live rule objects.

    For every spec the factory:
      - looks up the implementing class in the rule registry,
      - constructs it with ``(name, params)``,
      - stamps ``rule_id``, ``severity`` and ``context`` onto the instance,
      - raises descriptive errors for unknown rules, duplicate ids and
        constructor failures.
    """

    def __init__(self, rule_specs: List[RuleSpec]):
        self.rule_specs = rule_specs

    @staticmethod
    def _lookup_rule_class(rule_name: str):
        """Return the registered class for ``rule_name`` or raise ``ValueError``."""
        try:
            return get_rule(rule_name)
        except KeyError:
            known = ", ".join(sorted(get_all_rule_names()))
            raise ValueError(
                f"Unknown rule '{rule_name}'. "
                f"Available rules: {known}"
            )

    def build_rules(self) -> List[BaseRule]:
        """
        Instantiate every rule declared in the contract, in declaration order.

        Raises:
            ValueError: the rule name is not registered, or the rule's own
                constructor rejected its parameters with a ValueError.
            DuplicateRuleIdError: two specs resolve to the same rule_id.
            RuntimeError: any other failure while building a rule (the
                original exception is attached as the cause).
        """
        built: List[BaseRule] = []
        # rule_id -> position of the spec that first claimed it (for error messages)
        first_seen_at: Dict[str, int] = {}

        for position, spec in enumerate(self.rule_specs):
            rule_name = spec.name
            rule_params = spec.params or {}
            rule_cls = self._lookup_rule_class(rule_name)

            try:
                # Rule constructors take exactly (name, params).
                instance: BaseRule = rule_cls(rule_name, rule_params)

                # The identity is derived from the spec, after construction.
                rule_id = _derive_rule_id(spec)
                if rule_id in first_seen_at:
                    column = rule_params.get("column")
                    raise DuplicateRuleIdError(
                        rule_id=rule_id,
                        rule_name=rule_name,
                        rule_index=position,
                        conflict_index=first_seen_at[rule_id],
                        column=column if isinstance(column, str) else None,
                    )
                first_seen_at[rule_id] = position

                instance.rule_id = rule_id
                instance.severity = spec.severity
                instance.context = spec.context or {}
                built.append(instance)
            except (ValueError, DuplicateRuleIdError):
                # Validation problems are surfaced to the caller unchanged.
                raise
            except Exception as exc:
                raise RuntimeError(f"Failed to instantiate rule '{rule_name}': {exc}") from exc

        return built

    @staticmethod
    def summarize_rules(rules: List[BaseRule]) -> List[Dict[str, Any]]:
        """Describe each rule (id, params, implementing class) for debugging/reporting."""
        summary: List[Dict[str, Any]] = []
        for rule in rules:
            entry = {
                # Falls back to the rule's name when no rule_id was assigned.
                "rule_id": getattr(rule, "rule_id", rule.name),
                "params": rule.params,
                "class": rule.__class__.__name__,
            }
            summary.append(entry)
        return summary
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# src/kontra/rules/predicates.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Set
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class Predicate:
    """
    Immutable record pairing a rule with its vectorized failure mask.

    Fields
    ------
    rule_id : str
        Stable identifier of the rule instance this predicate belongs to.
    expr : pl.Expr
        Boolean Polars expression that evaluates to True on rows that
        FAIL the rule.
    message : str
        Deterministic, human-readable text reported when the rule fails.
    columns : set[str]
        Names of the columns `expr` reads (used for column pruning).

    NOTE: the dataclass is frozen with default equality, so a field-based
    ``__hash__`` is generated; hashing an instance whose ``columns`` is a
    plain ``set`` raises ``TypeError``.
    """

    rule_id: str
    expr: pl.Expr
    message: str
    columns: Set[str]
|
kontra/rules/registry.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# src/kontra/rules/registry.py
|
|
2
|
+
from typing import Dict, Type
|
|
3
|
+
from kontra.rules.base import BaseRule
|
|
4
|
+
|
|
5
|
+
RULE_REGISTRY: Dict[str, Type[BaseRule]] = {}
|
|
6
|
+
|
|
7
|
+
def register_rule(name: str):
    """
    Class decorator factory that publishes a rule class under ``name``.

    The decorated class is stored in ``RULE_REGISTRY[name]`` (replacing any
    class previously stored under that name), receives a ``rule_key``
    attribute equal to ``name``, and is returned unchanged otherwise.
    """
    def _register(rule_cls: Type[BaseRule]):
        RULE_REGISTRY[name] = rule_cls
        rule_cls.rule_key = name
        return rule_cls

    return _register
|
|
14
|
+
|
|
15
|
+
def get_rule(name: str) -> Type[BaseRule]:
    """
    Look up the rule class registered under ``name``.

    Raises:
        KeyError: no rule has been registered under ``name``.
    """
    if name in RULE_REGISTRY:
        return RULE_REGISTRY[name]
    raise KeyError(f"Rule '{name}' not found in registry.")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_all_rule_names() -> set:
    """Return a new set holding every name currently in the registry."""
    # Iterating a dict yields its keys, so this copies the key set.
    return set(RULE_REGISTRY)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# src/kontra/rules/static_predicates.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Tuple
|
|
5
|
+
|
|
6
|
+
from kontra.rules.base import BaseRule
|
|
7
|
+
|
|
8
|
+
# A preplan predicate is the 4-tuple (rule_id, column, op, value),
# where op must be a member of ALLOWED_OPS.
PredicateT = Tuple[str, str, str, Any]
ALLOWED_OPS = {
    "==",
    "!=",
    ">=",
    ">",
    "<=",
    "<",
    "^=",
    "not_null",
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize(pairs: Iterable[PredicateT]) -> List[PredicateT]:
    """
    Validate and de-duplicate a stream of preplan predicates.

    A predicate is kept only when:
      - ``rule_id`` is a non-empty string,
      - ``column`` is a non-empty string,
      - ``op`` is a string contained in ``ALLOWED_OPS``.

    Invalid entries are dropped silently. Exact duplicates are removed and
    the first occurrence keeps its position.

    Values produced by rule hooks are arbitrary objects, so an unhashable
    ``value`` (e.g. a list) must not abort extraction: such predicates are
    de-duplicated by equality instead of by hash.
    """
    out: List[PredicateT] = []
    seen: set = set()
    # Fallback store for predicates whose value cannot be hashed.
    seen_unhashable: List[PredicateT] = []

    for rid, col, op, val in pairs:
        if not isinstance(rid, str) or not rid:
            continue
        if not isinstance(col, str) or not col:
            continue
        # The isinstance guard keeps an unhashable op from raising inside
        # the set-membership test below.
        if not isinstance(op, str) or op not in ALLOWED_OPS:
            continue

        key = (rid, col, op, val)
        try:
            if key in seen:
                continue
            seen.add(key)
        except TypeError:
            # val is unhashable: compare against earlier entries with ==.
            if key in seen_unhashable:
                continue
            seen_unhashable.append(key)
        out.append(key)

    return out
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _from_rule_hook(rule: BaseRule) -> List[PredicateT]:
|
|
33
|
+
"""Ask the rule itself (if it implements the optional hook)."""
|
|
34
|
+
fn = getattr(rule, "to_preplan_predicates", None)
|
|
35
|
+
if callable(fn):
|
|
36
|
+
try:
|
|
37
|
+
preds = fn() or []
|
|
38
|
+
except Exception:
|
|
39
|
+
preds = []
|
|
40
|
+
# Ensure each tuple starts with this rule's rule_id
|
|
41
|
+
fixed: List[PredicateT] = []
|
|
42
|
+
for item in preds:
|
|
43
|
+
if not isinstance(item, tuple) or len(item) != 4:
|
|
44
|
+
continue
|
|
45
|
+
rid, col, op, val = item
|
|
46
|
+
# Allow rule to omit rid; fill it in
|
|
47
|
+
if not isinstance(rid, str) or not rid:
|
|
48
|
+
rid = getattr(rule, "rule_id", getattr(rule, "name", ""))
|
|
49
|
+
fixed.append((rid, col, op, val))
|
|
50
|
+
return fixed
|
|
51
|
+
return []
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _conservative_builtin_mapping(rule: BaseRule) -> List[PredicateT]:
|
|
55
|
+
"""
|
|
56
|
+
Optional mapping for known built-ins, so you don't have to add hooks yet.
|
|
57
|
+
Keep this conservative and obvious (no regex engines etc.).
|
|
58
|
+
"""
|
|
59
|
+
name = getattr(rule, "name", "")
|
|
60
|
+
params: Dict[str, Any] = getattr(rule, "params", {}) or {}
|
|
61
|
+
rid = getattr(rule, "rule_id", name)
|
|
62
|
+
|
|
63
|
+
out: List[PredicateT] = []
|
|
64
|
+
|
|
65
|
+
# not_null(column)
|
|
66
|
+
if name.endswith("not_null"):
|
|
67
|
+
col = params.get("column")
|
|
68
|
+
if isinstance(col, str) and col:
|
|
69
|
+
out.append((rid, col, "not_null", True))
|
|
70
|
+
|
|
71
|
+
# equals / allowed_values (single value)
|
|
72
|
+
if name in {"equals", "allowed_values"}:
|
|
73
|
+
col = params.get("column")
|
|
74
|
+
val = params.get("value", None)
|
|
75
|
+
if val is None:
|
|
76
|
+
vals = params.get("values")
|
|
77
|
+
if isinstance(vals, (list, tuple)) and len(vals) == 1:
|
|
78
|
+
val = vals[0]
|
|
79
|
+
if isinstance(col, str) and col and isinstance(val, (str, int, float)):
|
|
80
|
+
out.append((rid, col, "==", val))
|
|
81
|
+
|
|
82
|
+
# min / max / range style (very conservative)
|
|
83
|
+
if name in {"gte", "min_value", "min"}:
|
|
84
|
+
col = params.get("column"); v = params.get("value")
|
|
85
|
+
if isinstance(col, str) and col and v is not None:
|
|
86
|
+
out.append((rid, col, ">=", v))
|
|
87
|
+
if name in {"lte", "max_value", "max"}:
|
|
88
|
+
col = params.get("column"); v = params.get("value")
|
|
89
|
+
if isinstance(col, str) and col and v is not None:
|
|
90
|
+
out.append((rid, col, "<=", v))
|
|
91
|
+
|
|
92
|
+
# regex("^prefix") → prefix
|
|
93
|
+
if name == "regex":
|
|
94
|
+
col = params.get("column")
|
|
95
|
+
pat = params.get("pattern", "")
|
|
96
|
+
if isinstance(col, str) and col and isinstance(pat, str) and pat.startswith("^"):
|
|
97
|
+
# only allow pure-prefix (no special chars beyond anchors)
|
|
98
|
+
body = pat[1:]
|
|
99
|
+
if body and all(ch.isalnum() or ch in {"_", "-", ".", "@"} for ch in body):
|
|
100
|
+
out.append((rid, col, "^=", body))
|
|
101
|
+
|
|
102
|
+
return out
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def extract_static_predicates_from_rules(rules: List[BaseRule]) -> List[PredicateT]:
    """
    Build the normalized preplan predicate list for a set of rules.

    Pass the ORIGINAL rule instances (as produced by RuleFactory). For each
    rule, predicates from its optional hook are gathered first, followed by
    those from the conservative built-in mapping; the combined list is then
    validated and de-duplicated.
    """
    candidates: List[PredicateT] = []
    for rule in rules:
        candidates += _from_rule_hook(rule)
        candidates += _conservative_builtin_mapping(rule)
    return _normalize(candidates)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Backward-compatible shim if you still want a function named extract_static_predicates
|
|
118
|
+
# and you have access to the original rules alongside the compiled plan.
|
|
119
|
+
def extract_static_predicates(*, rules: List[BaseRule]) -> List[PredicateT]:
    """Keyword-only alias that forwards to ``extract_static_predicates_from_rules``."""
    predicates = extract_static_predicates_from_rules(rules)
    return predicates
|
kontra/scout/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# src/kontra/scout/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Kontra Scout - Contract-free data profiling for LLM context compression.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from kontra.scout.profiler import ScoutProfiler
|
|
7
|
+
from kontra.scout.types import ColumnProfile, DatasetProfile
|
|
8
|
+
|
|
9
|
+
__all__ = ["ScoutProfiler", "ColumnProfile", "DatasetProfile"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# src/kontra/scout/backends/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Scout profiler backends - pluggable data source adapters.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .base import ProfilerBackend
|
|
7
|
+
from .duckdb_backend import DuckDBBackend
|
|
8
|
+
|
|
9
|
+
__all__ = ["ProfilerBackend", "DuckDBBackend"]
|
|
10
|
+
|
|
11
|
+
# PostgreSQL backend (optional - requires psycopg)
|
|
12
|
+
try:
|
|
13
|
+
from .postgres_backend import PostgreSQLBackend
|
|
14
|
+
|
|
15
|
+
__all__.append("PostgreSQLBackend")
|
|
16
|
+
except ImportError:
|
|
17
|
+
pass
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# src/kontra/scout/backends/base.py
|
|
2
|
+
"""
|
|
3
|
+
ProfilerBackend protocol - abstract interface for Scout data source adapters.
|
|
4
|
+
|
|
5
|
+
Each backend implements SQL-based profiling for a specific data source type.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional, Protocol, Tuple
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProfilerBackend(Protocol):
    """
    Structural interface every Scout profiler backend must satisfy.

    A backend is responsible for:
      - opening and closing the connection to its data source,
      - reporting the schema, row count and (optionally) size,
      - running aggregation queries,
      - fetching top, distinct and sample values per column,
      - escaping identifiers for its SQL dialect.

    Implementations named by this module:
      - DuckDBBackend: Parquet, CSV (local + S3)
      - PostgreSQLBackend: PostgreSQL tables
    """

    def connect(self) -> None:
        """Open the connection to the data source."""
        ...

    def close(self) -> None:
        """Release the connection and any associated resources."""
        ...

    def get_schema(self) -> List[Tuple[str, str]]:
        """
        Describe the columns as ``[(column_name, raw_type), ...]``.

        ``raw_type`` is the data source's native type string.
        """
        ...

    def get_row_count(self) -> int:
        """Return the total number of rows (may use metadata optimization)."""
        ...

    def get_estimated_size_bytes(self) -> Optional[int]:
        """Return the estimated size in bytes, if available."""
        ...

    def execute_stats_query(self, exprs: List[str]) -> Dict[str, Any]:
        """
        Run one aggregation query made of several select expressions.

        Args:
            exprs: SQL expressions such as "COUNT(*) AS total".

        Returns:
            Mapping from each column alias to its value.
        """
        ...

    def fetch_top_values(
        self, column: str, limit: int
    ) -> List[Tuple[Any, int]]:
        """
        Return the most frequent values of a column.

        Args:
            column: Column name.
            limit: Maximum number of values to return.

        Returns:
            ``(value, count)`` tuples ordered by count, descending.
        """
        ...

    def fetch_distinct_values(self, column: str) -> List[Any]:
        """
        Return every distinct value of a low-cardinality column.

        Args:
            column: Column name.

        Returns:
            The distinct values, ordered.
        """
        ...

    def fetch_sample_values(self, column: str, limit: int) -> List[Any]:
        """
        Return a sample of a column's values for pattern detection.

        Args:
            column: Column name.
            limit: Maximum number of values to return.

        Returns:
            The sampled values.
        """
        ...

    def esc_ident(self, name: str) -> str:
        """Escape a column/table identifier for this backend's SQL dialect."""
        ...

    @property
    def source_format(self) -> str:
        """Identifier of the source format (e.g., 'parquet', 'csv', 'postgres')."""
        ...
|