kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# src/kontra/scout/reporters/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Kontra Scout reporters for different output formats.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from kontra.scout.types import DatasetProfile
|
|
9
|
+
|
|
10
|
+
from .json_reporter import render_json, render_llm
|
|
11
|
+
from .markdown_reporter import render_markdown
|
|
12
|
+
from .rich_reporter import render_rich
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def render_profile(
    profile: DatasetProfile,
    format: Literal["rich", "json", "markdown", "llm"] = "rich",
) -> str:
    """
    Render a DatasetProfile to the specified format.

    Args:
        profile: The DatasetProfile to render
        format: Output format ("rich", "json", "markdown", "llm")

    Returns:
        Formatted string output
    """
    # Dispatch table; any unrecognized format falls back to the rich
    # renderer, matching the original if/elif/else behavior.
    renderers = {
        "json": render_json,
        "markdown": render_markdown,
        "llm": render_llm,
    }
    renderer = renderers.get(format, render_rich)
    return renderer(profile)


__all__ = ["render_profile", "render_json", "render_markdown", "render_rich", "render_llm"]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# src/kontra/scout/reporters/json_reporter.py
|
|
2
|
+
"""
|
|
3
|
+
JSON reporter for Kontra Scout - optimized for LLM consumption.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from typing import Any, Dict
|
|
10
|
+
|
|
11
|
+
from kontra.scout.types import DatasetProfile
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def render_json(profile: DatasetProfile, indent: int = 2) -> str:
    """
    Render a DatasetProfile as JSON.

    Args:
        profile: The DatasetProfile to render
        indent: JSON indentation (default: 2)

    Returns:
        JSON string
    """
    # default=str stringifies non-JSON-native values (e.g. datetimes);
    # ensure_ascii=False keeps non-ASCII text human-readable.
    return json.dumps(
        profile.to_dict(),
        indent=indent,
        default=str,
        ensure_ascii=False,
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_compact_json(profile: DatasetProfile) -> Dict[str, Any]:
    """
    Build a compact JSON representation optimized for LLM context.
    Omits null/empty fields for minimal token usage.
    """
    # Serialize first, then prune None/empty entries recursively.
    return _strip_nulls(profile.to_dict())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _strip_nulls(obj: Any) -> Any:
|
|
39
|
+
"""Recursively remove None values and empty lists/dicts."""
|
|
40
|
+
if isinstance(obj, dict):
|
|
41
|
+
return {
|
|
42
|
+
k: _strip_nulls(v)
|
|
43
|
+
for k, v in obj.items()
|
|
44
|
+
if v is not None and v != [] and v != {}
|
|
45
|
+
}
|
|
46
|
+
elif isinstance(obj, list):
|
|
47
|
+
return [_strip_nulls(item) for item in obj if item is not None]
|
|
48
|
+
return obj
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def render_llm(profile: DatasetProfile) -> str:
    """
    Render a DatasetProfile in token-optimized format for LLM context.

    Design goals:
    - Minimal tokens while preserving signal
    - Easy for LLM to parse and reason about
    - Key info: schema, null rates, cardinality, semantic types
    - Actionable: enough info to infer validation rules

    Format:
    ```
    # Dataset: source_uri
    rows=N cols=N

    ## Columns
    col_name: type | nulls=N% | distinct=N | semantic_type
     values: [val1, val2, ...] or top: val1(N%), val2(N%)
    ```
    """
    lines = []

    # Header
    lines.append(f"# Dataset: {profile.source_uri}")
    lines.append(f"rows={profile.row_count:,} cols={profile.column_count}")
    lines.append("")
    lines.append("## Columns")

    for col in profile.columns:
        lines.extend(_llm_column_lines(col))

    lines.extend(_llm_summary_lines(profile))

    return "\n".join(lines)


def _llm_column_lines(col) -> list:
    """Build the lines for one column: the `name: type | ...` line plus
    optional value/top/range detail lines."""
    lines = []
    parts = [col.dtype]

    # Null rate (only if > 0); very small rates collapse to "<0.1%".
    if col.null_rate > 0:
        null_pct = col.null_rate * 100
        if null_pct < 0.1:
            parts.append("nulls=<0.1%")
        else:
            parts.append(f"nulls={null_pct:.1f}%")

    # Distinct count with uniqueness hint
    if col.uniqueness_ratio >= 0.99 and col.distinct_count > 100:
        parts.append(f"distinct={col.distinct_count:,} (unique)")
    elif col.distinct_count <= 20:
        parts.append(f"distinct={col.distinct_count}")
    else:
        parts.append(f"distinct={col.distinct_count:,}")

    # Semantic type
    if col.semantic_type:
        parts.append(col.semantic_type)

    # Pattern detection (first pattern only — keep tokens down)
    if col.detected_patterns:
        parts.append(f"pattern:{col.detected_patterns[0]}")

    lines.append(f"{col.name}: {' | '.join(parts)}")

    # Values line: full value list for low cardinality, else top values with %.
    if col.values and col.is_low_cardinality:
        vals_str = ", ".join(repr(v) for v in col.values[:10])
        if len(col.values) > 10:
            vals_str += f", ... ({len(col.values)} total)"
        lines.append(f" values: [{vals_str}]")
    elif col.top_values:
        top_parts = []
        for tv in col.top_values[:5]:
            val_repr = repr(tv.value) if isinstance(tv.value, str) else str(tv.value)
            top_parts.append(f"{val_repr}({tv.pct:.0f}%)")
        lines.append(f" top: {', '.join(top_parts)}")

    # Temporal range (useful for freshness rules)
    if col.temporal and (col.temporal.date_min or col.temporal.date_max):
        date_range = f"{col.temporal.date_min or '?'} to {col.temporal.date_max or '?'}"
        lines.append(f" range: {date_range}")

    return lines


def _llm_summary_lines(profile) -> list:
    """Build the trailing `## Summary` section: dtype counts, high-null
    columns, likely identifier columns, and categorical columns."""
    lines = ["", "## Summary"]

    # Count column types
    type_counts: Dict[str, int] = {}
    for col in profile.columns:
        t = col.dtype
        type_counts[t] = type_counts.get(t, 0) + 1
    type_summary = ", ".join(f"{t}:{n}" for t, n in sorted(type_counts.items()))
    lines.append(f"types: {type_summary}")

    # Identify potential issues: columns with >10% nulls
    issues = [
        f"{col.name}:{col.null_rate*100:.0f}%null"
        for col in profile.columns
        if col.null_rate > 0.1
    ]
    if issues:
        lines.append(f"high_nulls: {', '.join(issues[:5])}")

    # Identify unique columns (likely identifiers)
    unique_cols = [
        col.name for col in profile.columns
        if col.uniqueness_ratio >= 0.99 and col.distinct_count > 100
    ]
    if unique_cols:
        lines.append(f"likely_ids: {', '.join(unique_cols[:5])}")

    # Identify categorical columns
    categorical = [
        col.name for col in profile.columns
        if col.is_low_cardinality or col.semantic_type == "category"
    ]
    if categorical:
        lines.append(f"categorical: {', '.join(categorical[:5])}")

    return lines
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# src/kontra/scout/reporters/markdown_reporter.py
|
|
2
|
+
"""
|
|
3
|
+
Markdown reporter for Kontra Scout - documentation-friendly output.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from kontra.scout.types import DatasetProfile, ColumnProfile
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def render_markdown(profile: DatasetProfile) -> str:
    """
    Render a DatasetProfile as Markdown.

    Returns Markdown string suitable for documentation or GitHub.

    Sections emitted: summary, schema table, then (when data is present)
    categorical, numeric, string and temporal column details plus
    detected patterns, ending with a generator footer.
    """
    lines: List[str] = []

    # Header
    lines.append(f"# Data Profile: {profile.source_uri}")
    lines.append("")

    # Summary
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Format:** {profile.source_format}")
    lines.append(f"- **Rows:** {profile.row_count:,}")
    lines.append(f"- **Columns:** {profile.column_count}")
    if profile.estimated_size_bytes:
        size_mb = profile.estimated_size_bytes / (1024 * 1024)
        lines.append(f"- **Size:** {size_mb:.1f} MB")
    if profile.sampled:
        lines.append(f"- **Sampled:** {profile.sample_size:,} rows")
    lines.append(f"- **Profiled at:** {profile.profiled_at}")
    lines.append(f"- **Duration:** {profile.profile_duration_ms} ms")
    lines.append("")

    # Schema table
    lines.append("## Schema")
    lines.append("")
    lines.append("| Column | Type | Nulls | Distinct | Cardinality |")
    lines.append("|--------|------|-------|----------|-------------|")

    for col in profile.columns:
        null_pct = f"{col.null_rate * 100:.1f}%"
        distinct = f"{col.distinct_count:,}"
        card = _cardinality_label(col)
        lines.append(f"| {col.name} | {col.dtype} | {null_pct} | {distinct} | {card} |")

    lines.append("")

    # Low cardinality columns (categorical)
    low_card_cols = [c for c in profile.columns if c.is_low_cardinality and c.values]
    if low_card_cols:
        lines.append("## Categorical Columns")
        lines.append("")
        for col in low_card_cols:
            lines.append(f"### {col.name}")
            lines.append("")
            if col.values:
                lines.append(f"**Values ({len(col.values)}):** `{', '.join(str(v) for v in col.values)}`")
            if col.top_values:
                lines.append("")
                lines.append("| Value | Count | % |")
                lines.append("|-------|-------|---|")
                for tv in col.top_values:
                    lines.append(f"| {tv.value} | {tv.count:,} | {tv.pct:.1f}% |")
            lines.append("")

    # Numeric columns
    numeric_cols = [c for c in profile.columns if c.numeric]
    if numeric_cols:
        lines.append("## Numeric Columns")
        lines.append("")
        lines.append("| Column | Min | Max | Mean | Median | Std |")
        lines.append("|--------|-----|-----|------|--------|-----|")
        for col in numeric_cols:
            n = col.numeric
            lines.append(
                f"| {col.name} | "
                f"{_fmt(n.min)} | {_fmt(n.max)} | "
                f"{_fmt(n.mean)} | {_fmt(n.median)} | {_fmt(n.std)} |"
            )
        lines.append("")

    # String columns
    string_cols = [c for c in profile.columns if c.string]
    if string_cols:
        lines.append("## String Columns")
        lines.append("")
        lines.append("| Column | Min Len | Max Len | Avg Len | Empty |")
        lines.append("|--------|---------|---------|---------|-------|")
        for col in string_cols:
            s = col.string
            # BUGFIX: the previous `s.min_length or 'N/A'` rendered a
            # legitimate length of 0 as "N/A" (0 is falsy). Only a missing
            # (None) statistic should display as N/A.
            min_len = "N/A" if s.min_length is None else s.min_length
            max_len = "N/A" if s.max_length is None else s.max_length
            lines.append(
                f"| {col.name} | "
                f"{min_len} | {max_len} | "
                f"{_fmt(s.avg_length)} | {s.empty_count:,} |"
            )
        lines.append("")

    # Temporal columns
    temporal_cols = [c for c in profile.columns if c.temporal]
    if temporal_cols:
        lines.append("## Temporal Columns")
        lines.append("")
        lines.append("| Column | Min Date | Max Date |")
        lines.append("|--------|----------|----------|")
        for col in temporal_cols:
            t = col.temporal
            # Dates are objects (never falsy-but-present), so `or` is safe here.
            lines.append(f"| {col.name} | {t.date_min or 'N/A'} | {t.date_max or 'N/A'} |")
        lines.append("")

    # Pattern detection
    pattern_cols = [c for c in profile.columns if c.detected_patterns]
    if pattern_cols:
        lines.append("## Detected Patterns")
        lines.append("")
        for col in pattern_cols:
            patterns = ", ".join(col.detected_patterns)
            lines.append(f"- **{col.name}:** {patterns}")
        lines.append("")

    # Footer
    lines.append("---")
    lines.append(f"*Generated by Kontra Scout v{profile.engine_version}*")

    return "\n".join(lines)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _cardinality_label(col: ColumnProfile) -> str:
|
|
134
|
+
"""Get cardinality label for a column."""
|
|
135
|
+
if col.uniqueness_ratio >= 0.99 and col.null_rate == 0:
|
|
136
|
+
return "unique"
|
|
137
|
+
if col.is_low_cardinality:
|
|
138
|
+
return "low"
|
|
139
|
+
if col.distinct_count < 100:
|
|
140
|
+
return "medium"
|
|
141
|
+
return "high"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _fmt(val: float | None) -> str:
|
|
145
|
+
"""Format a number for Markdown."""
|
|
146
|
+
if val is None:
|
|
147
|
+
return "N/A"
|
|
148
|
+
if abs(val) >= 1000:
|
|
149
|
+
return f"{val:,.0f}"
|
|
150
|
+
if abs(val) >= 1:
|
|
151
|
+
return f"{val:.2f}"
|
|
152
|
+
return f"{val:.4f}"
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# src/kontra/scout/reporters/rich_reporter.py
|
|
2
|
+
"""
|
|
3
|
+
Rich console reporter for Kontra Scout.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.text import Text
|
|
14
|
+
|
|
15
|
+
from kontra.scout.types import DatasetProfile, ColumnProfile
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def render_rich(profile: DatasetProfile) -> str:
    """
    Render a DatasetProfile as Rich console output.

    Returns a string representation (for compatibility with other reporters).
    For direct console output, use print_rich() instead.
    """
    # Capture the Rich rendering into an in-memory buffer instead of stdout.
    from io import StringIO

    buffer = StringIO()
    _print_to_console(Console(file=buffer, force_terminal=True, width=120), profile)
    return buffer.getvalue()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def print_rich(profile: DatasetProfile) -> None:
    """Print profile directly to console with Rich formatting."""
    # Default Console writes straight to the current terminal.
    _print_to_console(Console(), profile)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _print_to_console(console: Console, profile: DatasetProfile) -> None:
    """Internal: render profile to a console instance.

    Layout: a header panel, a per-column table, then optional
    "Top Values" and "Numeric Summary" sections (each capped at 5 columns).
    """
    # Header: source URI in the panel title; row/column counts, optional
    # size and sampling info in the panel body.
    title = f"[bold cyan]Kontra Scout[/bold cyan] - {profile.source_uri}"
    size_str = ""
    if profile.estimated_size_bytes:
        size_mb = profile.estimated_size_bytes / (1024 * 1024)
        size_str = f" | Size: {size_mb:.1f} MB"
    sample_str = f" (sampled: {profile.sample_size:,} rows)" if profile.sampled else ""

    header = (
        f"Rows: [bold]{profile.row_count:,}[/bold] | "
        f"Columns: [bold]{profile.column_count}[/bold]{size_str} | "
        f"Duration: [bold]{profile.profile_duration_ms}[/bold] ms{sample_str}"
    )
    console.print(Panel(header, title=title, border_style="cyan"))

    # Column table
    table = Table(show_header=True, header_style="bold magenta", expand=True)
    table.add_column("Column", style="cyan", no_wrap=True)
    table.add_column("Type", style="green")
    table.add_column("Nulls", justify="right")
    table.add_column("Distinct", justify="right")
    table.add_column("Cardinality")
    table.add_column("Info")

    for col in profile.columns:
        null_pct = f"{col.null_rate * 100:.1f}%"
        distinct_str = f"{col.distinct_count:,}"

        # Cardinality classification: unique > low (with inline value
        # preview) > medium (<100 distinct) > high.
        if col.uniqueness_ratio >= 0.99 and col.null_rate == 0:
            card = "[bold green]unique[/bold green]"
        elif col.is_low_cardinality:
            if col.values:
                # Show at most 5 values, then a "+N more" suffix.
                vals = ", ".join(str(v) for v in col.values[:5])
                if len(col.values) > 5:
                    vals += f" +{len(col.values) - 5} more"
                card = f"[yellow]low[/yellow]: [{vals}]"
            else:
                card = "[yellow]low[/yellow]"
        elif col.distinct_count < 100:
            card = "[blue]medium[/blue]"
        else:
            card = "high"

        # Info column: semantic type, detected patterns, numeric min/max/mean,
        # and temporal range — whichever are present, joined with " | ".
        info_parts: List[str] = []
        if col.semantic_type:
            info_parts.append(f"[dim]{col.semantic_type}[/dim]")
        if col.detected_patterns:
            info_parts.append(f"[magenta]{', '.join(col.detected_patterns)}[/magenta]")
        if col.numeric:
            info_parts.append(
                f"[dim]min={_fmt_num(col.numeric.min)}, "
                f"max={_fmt_num(col.numeric.max)}, "
                f"mean={_fmt_num(col.numeric.mean)}[/dim]"
            )
        if col.temporal:
            info_parts.append(f"[dim]{col.temporal.date_min} to {col.temporal.date_max}[/dim]")

        table.add_row(
            col.name,
            col.dtype,
            null_pct,
            distinct_str,
            card,
            " | ".join(info_parts) if info_parts else "",
        )

    console.print(table)

    # Top values section (only low-cardinality columns that have them)
    cols_with_top = [c for c in profile.columns if c.top_values and c.is_low_cardinality]
    if cols_with_top:
        console.print()
        console.print("[bold]Top Values:[/bold]")
        for col in cols_with_top[:5]:  # Limit to 5 columns
            # Top 3 values per column, each with its percentage share.
            vals = ", ".join(
                f"{tv.value} ({tv.pct:.1f}%)" for tv in col.top_values[:3]
            )
            console.print(f" [cyan]{col.name}[/cyan]: {vals}")

    # Numeric summary
    numeric_cols = [c for c in profile.columns if c.numeric]
    if numeric_cols:
        console.print()
        console.print("[bold]Numeric Summary:[/bold]")
        for col in numeric_cols[:5]:  # Limit to 5
            n = col.numeric
            console.print(
                f" [cyan]{col.name}[/cyan]: "
                f"min={_fmt_num(n.min)}, max={_fmt_num(n.max)}, "
                f"mean={_fmt_num(n.mean)}, median={_fmt_num(n.median)}"
            )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _fmt_num(val: float | None) -> str:
|
|
137
|
+
"""Format a number for display."""
|
|
138
|
+
if val is None:
|
|
139
|
+
return "N/A"
|
|
140
|
+
if abs(val) >= 1000:
|
|
141
|
+
return f"{val:,.0f}"
|
|
142
|
+
if abs(val) >= 1:
|
|
143
|
+
return f"{val:.2f}"
|
|
144
|
+
return f"{val:.4f}"
|