PyPI - kontra - Versions diffs - 0.5.2__py3-none-any.whl - Mend

kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

kontra/__init__.py +1871 -0
kontra/api/__init__.py +22 -0
kontra/api/compare.py +340 -0
kontra/api/decorators.py +153 -0
kontra/api/results.py +2121 -0
kontra/api/rules.py +681 -0
kontra/cli/__init__.py +0 -0
kontra/cli/commands/__init__.py +1 -0
kontra/cli/commands/config.py +153 -0
kontra/cli/commands/diff.py +450 -0
kontra/cli/commands/history.py +196 -0
kontra/cli/commands/profile.py +289 -0
kontra/cli/commands/validate.py +468 -0
kontra/cli/constants.py +6 -0
kontra/cli/main.py +48 -0
kontra/cli/renderers.py +304 -0
kontra/cli/utils.py +28 -0
kontra/config/__init__.py +34 -0
kontra/config/loader.py +127 -0
kontra/config/models.py +49 -0
kontra/config/settings.py +797 -0
kontra/connectors/__init__.py +0 -0
kontra/connectors/db_utils.py +251 -0
kontra/connectors/detection.py +323 -0
kontra/connectors/handle.py +368 -0
kontra/connectors/postgres.py +127 -0
kontra/connectors/sqlserver.py +226 -0
kontra/engine/__init__.py +0 -0
kontra/engine/backends/duckdb_session.py +227 -0
kontra/engine/backends/duckdb_utils.py +18 -0
kontra/engine/backends/polars_backend.py +47 -0
kontra/engine/engine.py +1205 -0
kontra/engine/executors/__init__.py +15 -0
kontra/engine/executors/base.py +50 -0
kontra/engine/executors/database_base.py +528 -0
kontra/engine/executors/duckdb_sql.py +607 -0
kontra/engine/executors/postgres_sql.py +162 -0
kontra/engine/executors/registry.py +69 -0
kontra/engine/executors/sqlserver_sql.py +163 -0
kontra/engine/materializers/__init__.py +14 -0
kontra/engine/materializers/base.py +42 -0
kontra/engine/materializers/duckdb.py +110 -0
kontra/engine/materializers/factory.py +22 -0
kontra/engine/materializers/polars_connector.py +131 -0
kontra/engine/materializers/postgres.py +157 -0
kontra/engine/materializers/registry.py +138 -0
kontra/engine/materializers/sqlserver.py +160 -0
kontra/engine/result.py +15 -0
kontra/engine/sql_utils.py +611 -0
kontra/engine/sql_validator.py +609 -0
kontra/engine/stats.py +194 -0
kontra/engine/types.py +138 -0
kontra/errors.py +533 -0
kontra/logging.py +85 -0
kontra/preplan/__init__.py +5 -0
kontra/preplan/planner.py +253 -0
kontra/preplan/postgres.py +179 -0
kontra/preplan/sqlserver.py +191 -0
kontra/preplan/types.py +24 -0
kontra/probes/__init__.py +20 -0
kontra/probes/compare.py +400 -0
kontra/probes/relationship.py +283 -0
kontra/reporters/__init__.py +0 -0
kontra/reporters/json_reporter.py +190 -0
kontra/reporters/rich_reporter.py +11 -0
kontra/rules/__init__.py +35 -0
kontra/rules/base.py +186 -0
kontra/rules/builtin/__init__.py +40 -0
kontra/rules/builtin/allowed_values.py +156 -0
kontra/rules/builtin/compare.py +188 -0
kontra/rules/builtin/conditional_not_null.py +213 -0
kontra/rules/builtin/conditional_range.py +310 -0
kontra/rules/builtin/contains.py +138 -0
kontra/rules/builtin/custom_sql_check.py +182 -0
kontra/rules/builtin/disallowed_values.py +140 -0
kontra/rules/builtin/dtype.py +203 -0
kontra/rules/builtin/ends_with.py +129 -0
kontra/rules/builtin/freshness.py +240 -0
kontra/rules/builtin/length.py +193 -0
kontra/rules/builtin/max_rows.py +35 -0
kontra/rules/builtin/min_rows.py +46 -0
kontra/rules/builtin/not_null.py +121 -0
kontra/rules/builtin/range.py +222 -0
kontra/rules/builtin/regex.py +143 -0
kontra/rules/builtin/starts_with.py +129 -0
kontra/rules/builtin/unique.py +124 -0
kontra/rules/condition_parser.py +203 -0
kontra/rules/execution_plan.py +455 -0
kontra/rules/factory.py +103 -0
kontra/rules/predicates.py +25 -0
kontra/rules/registry.py +24 -0
kontra/rules/static_predicates.py +120 -0
kontra/scout/__init__.py +9 -0
kontra/scout/backends/__init__.py +17 -0
kontra/scout/backends/base.py +111 -0
kontra/scout/backends/duckdb_backend.py +359 -0
kontra/scout/backends/postgres_backend.py +519 -0
kontra/scout/backends/sqlserver_backend.py +577 -0
kontra/scout/dtype_mapping.py +150 -0
kontra/scout/patterns.py +69 -0
kontra/scout/profiler.py +801 -0
kontra/scout/reporters/__init__.py +39 -0
kontra/scout/reporters/json_reporter.py +165 -0
kontra/scout/reporters/markdown_reporter.py +152 -0
kontra/scout/reporters/rich_reporter.py +144 -0
kontra/scout/store.py +208 -0
kontra/scout/suggest.py +200 -0
kontra/scout/types.py +652 -0
kontra/state/__init__.py +29 -0
kontra/state/backends/__init__.py +79 -0
kontra/state/backends/base.py +348 -0
kontra/state/backends/local.py +480 -0
kontra/state/backends/postgres.py +1010 -0
kontra/state/backends/s3.py +543 -0
kontra/state/backends/sqlserver.py +969 -0
kontra/state/fingerprint.py +166 -0
kontra/state/types.py +1061 -0
kontra/version.py +1 -0
kontra-0.5.2.dist-info/METADATA +122 -0
kontra-0.5.2.dist-info/RECORD +124 -0
kontra-0.5.2.dist-info/WHEEL +5 -0
kontra-0.5.2.dist-info/entry_points.txt +2 -0
kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
kontra-0.5.2.dist-info/top_level.txt +1 -0

kontra/api/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+# src/kontra/api/__init__.py
+"""
+Kontra Python API - Public interface classes and functions.
+"""
+from kontra.api.results import (
+    ValidationResult,
+    RuleResult,
+    Diff,
+    Suggestions,
+    SuggestedRule,
+)
+from kontra.api.rules import rules
+__all__ = [  # every entry is one of the names imported above from kontra.api.results / kontra.api.rules
+    "ValidationResult",
+    "RuleResult",
+    "Diff",
+    "Suggestions",
+    "SuggestedRule",
+    "rules",
+]

kontra/api/compare.py ADDED Viewed

@@ -0,0 +1,340 @@
+# src/kontra/api/compare.py
+"""
+Result types for transformation probes.
+These are the structured result types returned by compare() and
+profile_relationship() probes.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+@dataclass  # the class defines its own __repr__ below, which is used instead of a generated one
+class CompareResult:
+    """
+    Result of comparing two datasets to measure transformation effects.
+    Answers: "Did my transformation preserve rows and keys as expected?"
+    Does NOT answer: whether the transformation is "correct".
+    All measurements are deterministic and factual. Interpretation
+    belongs to the consumer (agent or human).
+    Attributes:
+        # Meta
+        before_rows: Number of rows in before dataset
+        after_rows: Number of rows in after dataset
+        key: Column(s) used as row identifier
+        execution_tier: Which execution tier computed the result ("polars" | "sql")
+        # Row stats
+        row_delta: Change in row count (after - before)
+        row_ratio: Ratio of after/before rows
+        # Key stats
+        unique_before: Count of unique keys in before
+        unique_after: Count of unique keys in after
+        preserved: Keys present in both before and after
+        dropped: Keys in before but not in after
+        added: Keys in after but not in before
+        duplicated_after: Keys appearing >1x in after (not row count, key count)
+        # Change stats (for preserved keys only)
+        unchanged_rows: Rows where no non-key columns changed
+        changed_rows: Rows where at least one non-key column changed
+        # Column stats
+        columns_added: Columns in after but not in before
+        columns_removed: Columns in before but not in after
+        columns_modified: Columns in both where at least one value differs
+        modified_fraction: {col: fraction of preserved rows where col changed}
+        nullability_delta: {col: {before: rate, after: rate}}
+        # Samples (bounded, explanatory only)
+        samples_duplicated_keys: Sample keys appearing >1x in after
+        samples_dropped_keys: Sample keys dropped from before
+        samples_changed_rows: Sample changed rows with before/after values
+        # Config
+        sample_limit: Maximum samples per category
+    Semantic Definitions:
+        - changed_rows: Structural value change. Any non-key column inequality
+          counts as a change. NULL → value and value → NULL are changes.
+          Computed only for preserved keys.
+        - duplicated_after: Count of keys (not rows) that appear more than once
+          in the after dataset. A key appearing 3x contributes 1 to this count.
+        - modified_fraction: For each modified column, the fraction of preserved
+          rows where that column's value differs between before and after.
+    """
+    # Meta
+    before_rows: int  # required (no default)
+    after_rows: int  # required (no default)
+    key: List[str]  # required (no default)
+    execution_tier: str = "polars"
+    # Row stats
+    row_delta: int = 0  # stored as given; this class does not derive it from before_rows/after_rows
+    row_ratio: float = 1.0  # stored as given; this class does not derive it from before_rows/after_rows
+    # Key stats
+    unique_before: int = 0
+    unique_after: int = 0
+    preserved: int = 0
+    dropped: int = 0
+    added: int = 0
+    duplicated_after: int = 0
+    # Change stats
+    unchanged_rows: int = 0
+    changed_rows: int = 0
+    # Column stats
+    columns_added: List[str] = field(default_factory=list)
+    columns_removed: List[str] = field(default_factory=list)
+    columns_modified: List[str] = field(default_factory=list)
+    modified_fraction: Dict[str, float] = field(default_factory=dict)
+    nullability_delta: Dict[str, Dict[str, Optional[float]]] = field(default_factory=dict)
+    # Samples
+    samples_duplicated_keys: List[Any] = field(default_factory=list)
+    samples_dropped_keys: List[Any] = field(default_factory=list)
+    samples_changed_rows: List[Dict[str, Any]] = field(default_factory=list)
+    # Config
+    sample_limit: int = 5  # not included in the to_dict() output below
+    def __repr__(self) -> str:  # one-line summary: row counts, signed delta, and key stats with thousands separators
+        delta_sign = "+" if self.row_delta >= 0 else ""  # negatives already render with "-"; zero shows as "+0"
+        return (
+            f"CompareResult(rows: {self.before_rows:,} → {self.after_rows:,} "
+            f"({delta_sign}{self.row_delta:,}), "
+            f"keys: preserved={self.preserved:,}, dropped={self.dropped:,}, added={self.added:,}, "
+            f"duplicated={self.duplicated_after:,})"
+        )
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary format matching the MVP schema.
+        Returns nested structure with meta, row_stats, key_stats,
+        change_stats, column_stats, and samples sections.
+        """
+        return {
+            "meta": {
+                "before_rows": self.before_rows,
+                "after_rows": self.after_rows,
+                "key": self.key,
+                "execution_tier": self.execution_tier,
+            },
+            "row_stats": {
+                "delta": self.row_delta,  # output key drops the "row_" prefix
+                "ratio": self.row_ratio,  # output key drops the "row_" prefix
+            },
+            "key_stats": {
+                "unique_before": self.unique_before,
+                "unique_after": self.unique_after,
+                "preserved": self.preserved,
+                "dropped": self.dropped,
+                "added": self.added,
+                "duplicated_after": self.duplicated_after,
+            },
+            "change_stats": {
+                "unchanged_rows": self.unchanged_rows,
+                "changed_rows": self.changed_rows,
+            },
+            "column_stats": {
+                "added": self.columns_added,  # output keys drop the "columns_" prefix
+                "removed": self.columns_removed,
+                "modified": self.columns_modified,
+                "modified_fraction": self.modified_fraction,
+                "nullability_delta": self.nullability_delta,
+            },
+            "samples": {
+                "duplicated_keys": self.samples_duplicated_keys,  # output keys drop the "samples_" prefix
+                "dropped_keys": self.samples_dropped_keys,
+                "changed_rows": self.samples_changed_rows,
+            },
+        }
+    def to_json(self, indent: Optional[int] = 2) -> str:
+        """Convert to JSON string."""
+        return json.dumps(self.to_dict(), indent=indent, default=str)  # default=str: values json cannot encode natively are written as str(value)
+    def to_llm(self) -> str:
+        """
+        Token-optimized format for LLM context.
+        This is a thin wrapper over to_dict(). No summarization, no prose.
+        Agents prompt themselves for interpretation.
+        """
+        return json.dumps(self.to_dict(), indent=2, default=str)  # NOTE(review): identical call to to_json() at its default indent=2; docstring says "token-optimized" but indentation is kept — confirm intent
+@dataclass  # the class defines its own __repr__ below, which is used instead of a generated one
+class RelationshipProfile:
+    """
+    Result of profiling the structural relationship between two datasets.
+    Answers: "What is the shape of this join?"
+    Does NOT answer: which join type to use, or whether the join is correct.
+    All measurements are deterministic and factual. Interpretation
+    belongs to the consumer (agent or human).
+    Attributes:
+        # Meta
+        on: Column(s) used as join key
+        left_rows: Number of rows in left dataset
+        right_rows: Number of rows in right dataset
+        execution_tier: Which execution tier computed the result
+        # Key stats - left
+        left_null_rate: Fraction of rows with NULL in join key
+        left_unique_keys: Count of unique key values
+        left_duplicate_keys: Number of keys appearing >1x
+        # Key stats - right
+        right_null_rate: Fraction of rows with NULL in join key
+        right_unique_keys: Count of unique key values
+        right_duplicate_keys: Number of keys appearing >1x
+        # Cardinality (rows per key)
+        # NOTE: min/max hides distribution shape. A single pathological key
+        # with max=1000 looks like "many-to-many" even if 99.9% of keys have
+        # 1 row. Acceptable for MVP since samples are present and we don't label.
+        left_key_multiplicity_min: Minimum rows per key (left)
+        left_key_multiplicity_max: Maximum rows per key (left)
+        right_key_multiplicity_min: Minimum rows per key (right)
+        right_key_multiplicity_max: Maximum rows per key (right)
+        # Coverage
+        left_keys_with_match: Keys in left that exist in right
+        left_keys_without_match: Keys in left that don't exist in right
+        right_keys_with_match: Keys in right that exist in left
+        right_keys_without_match: Keys in right that don't exist in left
+        # Samples (bounded, explanatory only)
+        samples_left_unmatched: Sample keys in left without match
+        samples_right_unmatched: Sample keys in right without match
+        samples_right_duplicates: Sample keys with >1 row in right
+        # Config
+        sample_limit: Maximum samples per category
+    """
+    # Meta
+    on: List[str]  # required (no default)
+    left_rows: int  # required (no default)
+    right_rows: int  # required (no default)
+    execution_tier: str = "polars"
+    # Key stats - left
+    left_null_rate: float = 0.0
+    left_unique_keys: int = 0
+    left_duplicate_keys: int = 0
+    # Key stats - right
+    right_null_rate: float = 0.0
+    right_unique_keys: int = 0
+    right_duplicate_keys: int = 0
+    # Cardinality
+    left_key_multiplicity_min: int = 0
+    left_key_multiplicity_max: int = 0
+    right_key_multiplicity_min: int = 0
+    right_key_multiplicity_max: int = 0
+    # Coverage
+    left_keys_with_match: int = 0
+    left_keys_without_match: int = 0
+    right_keys_with_match: int = 0
+    right_keys_without_match: int = 0
+    # Samples
+    samples_left_unmatched: List[Any] = field(default_factory=list)
+    samples_right_unmatched: List[Any] = field(default_factory=list)
+    samples_right_duplicates: List[Any] = field(default_factory=list)  # there is no matching left-side duplicates sample field
+    # Config
+    sample_limit: int = 5  # not included in the to_dict() output below
+    def __repr__(self) -> str:  # one-line summary: join key, rows/unique keys per side, matched/unique keys per side
+        return (
+            f"RelationshipProfile(on={self.on}, "
+            f"left={self.left_rows:,} rows/{self.left_unique_keys:,} keys, "
+            f"right={self.right_rows:,} rows/{self.right_unique_keys:,} keys, "
+            f"coverage: left={self.left_keys_with_match:,}/{self.left_unique_keys:,}, "
+            f"right={self.right_keys_with_match:,}/{self.right_unique_keys:,})"
+        )
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary format matching the MVP schema.
+        Returns nested structure with meta, key_stats, cardinality,
+        coverage, and samples sections.
+        """
+        return {
+            "meta": {
+                "on": self.on,
+                "left_rows": self.left_rows,
+                "right_rows": self.right_rows,
+                "execution_tier": self.execution_tier,
+            },
+            "key_stats": {
+                "left": {
+                    "null_rate": self.left_null_rate,
+                    "unique_keys": self.left_unique_keys,
+                    "duplicate_keys": self.left_duplicate_keys,
+                    "rows": self.left_rows,  # same value as meta.left_rows above
+                },
+                "right": {
+                    "null_rate": self.right_null_rate,
+                    "unique_keys": self.right_unique_keys,
+                    "duplicate_keys": self.right_duplicate_keys,
+                    "rows": self.right_rows,  # same value as meta.right_rows above
+                },
+            },
+            "cardinality": {
+                "left_key_multiplicity": {
+                    "min": self.left_key_multiplicity_min,
+                    "max": self.left_key_multiplicity_max,
+                },
+                "right_key_multiplicity": {
+                    "min": self.right_key_multiplicity_min,
+                    "max": self.right_key_multiplicity_max,
+                },
+            },
+            "coverage": {
+                "left_keys_with_match": self.left_keys_with_match,
+                "left_keys_without_match": self.left_keys_without_match,
+                "right_keys_with_match": self.right_keys_with_match,
+                "right_keys_without_match": self.right_keys_without_match,
+            },
+            "samples": {
+                "left_keys_without_match": self.samples_left_unmatched,  # output keys differ from the attribute names
+                "right_keys_without_match": self.samples_right_unmatched,
+                "right_keys_with_multiple_rows": self.samples_right_duplicates,
+            },
+        }
+    def to_json(self, indent: Optional[int] = 2) -> str:
+        """Convert to JSON string."""
+        return json.dumps(self.to_dict(), indent=indent, default=str)  # default=str: values json cannot encode natively are written as str(value)
+    def to_llm(self) -> str:
+        """
+        Token-optimized format for LLM context.
+        This is a thin wrapper over to_dict(). No summarization, no prose.
+        Agents prompt themselves for interpretation.
+        """
+        return json.dumps(self.to_dict(), indent=2, default=str)  # NOTE(review): identical call to to_json() at its default indent=2; docstring says "token-optimized" but indentation is kept — confirm intent

kontra/api/decorators.py ADDED Viewed

@@ -0,0 +1,153 @@
+# src/kontra/api/decorators.py
+"""
+Pipeline validation decorators for Kontra.
+Decorators for validating data returned from functions.
+"""
+from __future__ import annotations
+import functools
+import warnings
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    TypeVar,
+    Union,
+)
+from kontra.errors import ValidationError
+F = TypeVar("F", bound=Callable[..., Any])  # used in validate()'s return annotation, Callable[[F], F]
+# Built-in mode shortcuts
+OnFailMode = Literal["raise", "warn", "return_result"]
+# Callback signature: (result, data) -> data (or raise)
+OnFailCallback = Callable[["ValidationResult", Any], Any]  # type: ignore
+# Accept either a mode string or a callback
+OnFailHandler = Union[OnFailMode, OnFailCallback]
+def validate(
+    contract: Optional[str] = None,
+    rules: Optional[List[Dict[str, Any]]] = None,
+    on_fail: OnFailHandler = "raise",
+    save: bool = False,
+    sample: int = 0,
+    sample_columns: Optional[Union[List[str], str]] = None,
+) -> Callable[[F], F]:
+    """
+    Decorator to validate data returned from a function.
+    The decorated function must return a DataFrame (Polars or pandas)
+    or other data type supported by `kontra.validate()`.
+    Args:
+        contract: Path to a YAML contract file
+        rules: List of rule definitions (alternative to contract)
+        on_fail: Action when validation fails. Either a mode string or a callback:
+            - "raise": Raise ValidationError on blocking failures (default)
+            - "warn": Log warning, return data anyway
+            - "return_result": Return (data, ValidationResult) tuple
+            - Callable[[ValidationResult, data], data]: Custom handler
+        save: Whether to save the validation result to state
+        sample: Number of sample rows to collect for failures
+        sample_columns: Columns to include in samples (None=all, list, or "relevant")
+    Returns:
+        Decorated function
+    Raises:
+        ValueError: If neither contract nor rules is provided
+        ValidationError: If on_fail="raise" and validation has blocking failures
+    Example:
+        ```python
+        import kontra
+        from kontra import rules
+        # Built-in modes
+        @kontra.validate_decorator(
+            rules=[rules.not_null("id"), rules.unique("email")],
+            on_fail="raise"
+        )
+        def load_users() -> pl.DataFrame:
+            return pl.read_parquet("users.parquet")
+        # Custom callback - Kontra measures, you decide
+        def notify_slack(result, data):
+            if not result.passed:
+                slack.post(f"Validation failed: {result.failed_count} violations")
+            return data  # or raise, or transform, etc.
+        @kontra.validate_decorator(
+            rules=[rules.not_null("id")],
+            on_fail=notify_slack
+        )
+        def fetch_orders():
+            return db.query("SELECT * FROM orders")
+        ```
+    """
+    if contract is None and rules is None:  # checked once, when the decorator is created, not on each call
+        raise ValueError("Either 'contract' or 'rules' must be provided")
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            # Import here to avoid circular imports
+            import kontra
+            # Call the original function
+            data = func(*args, **kwargs)  # exceptions raised by func propagate; validation is not reached
+            # Validate the returned data
+            result = kontra.validate(
+                data,
+                contract=contract,
+                rules=rules,
+                save=save,
+                sample=sample,
+                sample_columns=sample_columns,
+            )
+            # Handle based on on_fail mode or callback
+            if callable(on_fail) and not isinstance(on_fail, str):
+                # User-provided callback: Kontra measured, user decides
+                return on_fail(result, data)  # invoked on pass and on fail; its return value is what the caller receives
+            if on_fail == "return_result":
+                return (data, result)  # returned regardless of result.passed
+            if not result.passed:
+                # Check for blocking failures
+                blocking_failures = [
+                    r for r in result.rules if not r.passed and r.severity == "blocking"
+                ]
+                if blocking_failures:  # failed rules of any other severity fall through and data is returned
+                    if on_fail == "raise":
+                        raise ValidationError(result)
+                    elif on_fail == "warn":  # NOTE(review): any other on_fail string is silently ignored here — confirm intended
+                        # Log warning
+                        warnings.warn(
+                            f"Validation failed in {func.__name__}: "
+                            f"{len(blocking_failures)} blocking rule(s) failed "
+                            f"({result.failed_count} total violations)",
+                            UserWarning,
+                            stacklevel=2,  # attribute the warning to the caller of the decorated function
+                        )
+            return data  # reached in every case that did not return or raise above
+        return wrapper  # type: ignore
+    return decorator
+# Alias for import convenience
+validate_decorator = validate  # same function object; the docstring example in validate() uses this name