odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,302 @@
1
+ """Setup helpers for ODIBI - Phase 2C performance utilities."""
2
+
3
+ import concurrent.futures
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+
9
@dataclass
class KeyVaultFetchResult:
    """Result of a Key Vault secret fetch operation."""

    # Name of the connection this fetch was performed for (used in reporting).
    connection_name: str
    # Key Vault name the secret was fetched from (or account label for skips).
    account: str
    # True when the secret was retrieved, or when no fetch was required.
    success: bool
    # Plaintext secret value on success; None otherwise.
    secret_value: Optional[str] = None
    # Exception captured on failure; None on success.
    error: Optional[Exception] = None
    # Wall-clock duration of the fetch attempt, in milliseconds.
    duration_ms: Optional[float] = None
19
+
20
+
21
def fetch_keyvault_secret(
    connection_name: str,
    key_vault_name: str,
    secret_name: str,
    timeout: float = 30.0,
) -> KeyVaultFetchResult:
    """Fetch a single Key Vault secret with timeout protection.

    Args:
        connection_name: Name of the connection (for error reporting)
        key_vault_name: Azure Key Vault name
        secret_name: Secret name in Key Vault
        timeout: Timeout in seconds (default: 30.0)

    Returns:
        KeyVaultFetchResult with success status and secret value or error.
        Never raises: every failure (missing SDK, auth error, timeout) is
        captured in the result's ``error`` field.
    """
    import time

    # monotonic() is immune to wall-clock adjustments, unlike time.time().
    start_time = time.monotonic()

    def _elapsed_ms() -> float:
        # Milliseconds elapsed since this call started.
        return (time.monotonic() - start_time) * 1000

    try:
        from azure.identity import DefaultAzureCredential
        from azure.keyvault.secrets import SecretClient
    except ImportError:
        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=False,
            error=ImportError(
                "Key Vault authentication requires 'azure-identity' and 'azure-keyvault-secrets'. "
                "Install with: pip install odibi[azure]"
            ),
            duration_ms=_elapsed_ms(),
        )

    try:
        credential = DefaultAzureCredential()
        kv_uri = f"https://{key_vault_name}.vault.azure.net"
        client = SecretClient(vault_url=kv_uri, credential=credential)

        # FIX: `timeout` was previously accepted but never enforced, so a
        # hung vault call could block indefinitely. Run the blocking fetch
        # on a single-use worker thread and bound the wait; a TimeoutError
        # is captured by the except clause below like any other failure.
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            secret = pool.submit(client.get_secret, secret_name).result(timeout=timeout)
        finally:
            # Don't block on a stuck worker; let it finish in the background.
            pool.shutdown(wait=False)

        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=True,
            secret_value=secret.value,
            duration_ms=_elapsed_ms(),
        )

    except Exception as e:
        # Auth, network, and timeout errors are reported, never raised.
        return KeyVaultFetchResult(
            connection_name=connection_name,
            account=key_vault_name,
            success=False,
            error=e,
            duration_ms=_elapsed_ms(),
        )
83
+
84
+
85
def fetch_keyvault_secrets_parallel(
    connections: Dict[str, Any],
    max_workers: int = 5,
    timeout: float = 30.0,
    verbose: bool = True,
) -> Dict[str, KeyVaultFetchResult]:
    """Fetch Key Vault secrets concurrently for every KV-backed connection.

    Connections exposing both a truthy ``key_vault_name`` and a truthy
    ``secret_name`` are fetched on a thread pool (regardless of auth mode);
    every other connection receives an immediate successful placeholder
    result with no secret value.

    Args:
        connections: Mapping of connection name -> connection instance.
        max_workers: Upper bound on concurrent fetches (default: 5).
        timeout: Per-secret timeout in seconds, forwarded to the fetcher.
        verbose: Emit progress output to stdout.

    Returns:
        Mapping of connection name -> KeyVaultFetchResult.
    """
    import time

    results: Dict[str, KeyVaultFetchResult] = {}
    kv_targets: List[Tuple[str, Any]] = []

    for conn_name, conn in connections.items():
        # A connection qualifies for a Key Vault fetch when it names both a
        # vault and a secret, whatever its auth mode happens to be.
        if getattr(conn, "key_vault_name", None) and getattr(conn, "secret_name", None):
            kv_targets.append((conn_name, conn))
        else:
            results[conn_name] = KeyVaultFetchResult(
                connection_name=conn_name,
                account=getattr(conn, "account", "unknown"),
                success=True,
                secret_value=None,
                duration_ms=0.0,
            )

    if not kv_targets:
        if verbose:
            print("- No Key Vault connections to fetch")
        return results

    if verbose:
        print(f"⚡ Fetching {len(kv_targets)} Key Vault secrets in parallel...")

    started = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(
                fetch_keyvault_secret,
                conn_name,
                conn.key_vault_name,
                conn.secret_name,
                timeout,
            ): (conn_name, conn)
            for conn_name, conn in kv_targets
        }

        for done in concurrent.futures.as_completed(pending):
            conn_name, _ = pending[done]
            outcome = done.result()
            results[conn_name] = outcome

            if verbose:
                if outcome.success:
                    print(f" - {conn_name}: {outcome.duration_ms:.0f}ms")
                else:
                    print(f" [X] {conn_name}: {type(outcome.error).__name__}")

    total_duration = (time.time() - started) * 1000

    if verbose:
        success_count = sum(1 for r in results.values() if r.success)
        print(
            f"- Completed in {total_duration:.0f}ms ({success_count}/{len(kv_targets)} successful)"
        )

    return results
184
+
185
+
186
def configure_connections_parallel(
    connections: Dict[str, Any],
    prefetch_secrets: bool = True,
    max_workers: int = 5,
    timeout: float = 30.0,
    verbose: bool = True,
) -> Tuple[Dict[str, Any], List[str]]:
    """Prefetch Key Vault secrets for a set of connections.

    Args:
        connections: Mapping of connection name -> connection instance.
        prefetch_secrets: When False, return immediately without fetching.
        max_workers: Upper bound on concurrent fetches.
        timeout: Per-fetch timeout in seconds.
        verbose: Emit progress output and warnings.

    Returns:
        Tuple of (connections, errors) where ``connections`` is the same
        mapping with fetched secrets cached on connections that expose a
        ``_cached_key`` attribute, and ``errors`` lists failure messages.
    """
    errors: List[str] = []

    if not prefetch_secrets:
        return connections, errors

    fetch_results = fetch_keyvault_secrets_parallel(
        connections, max_workers=max_workers, timeout=timeout, verbose=verbose
    )

    for conn_name, outcome in fetch_results.items():
        if outcome.success:
            # Cache the secret only on connections that support it.
            if outcome.secret_value:
                target = connections[conn_name]
                if hasattr(target, "_cached_key"):
                    target._cached_key = outcome.secret_value
            continue

        message = f"Failed to fetch secret for '{conn_name}': {outcome.error}"
        errors.append(message)
        if verbose:
            warnings.warn(message, UserWarning)

    return connections, errors
233
+
234
+
235
def validate_databricks_environment(verbose: bool = True) -> Dict[str, Any]:
    """Probe the current process for Databricks runtime facilities.

    Args:
        verbose: When True, print a human-readable summary of the checks.

    Returns:
        Dict with keys:
            - is_databricks (bool)
            - spark_available (bool)
            - dbutils_available (bool)
            - runtime_version (Optional[str])
            - errors (List[str])
    """
    info: Dict[str, Any] = {
        "is_databricks": False,
        "spark_available": False,
        "dbutils_available": False,
        "runtime_version": None,
        "errors": [],
    }

    # Databricks sets DATABRICKS_RUNTIME_VERSION inside its runtime.
    try:
        import os

        version = os.getenv("DATABRICKS_RUNTIME_VERSION")
        if version:
            info["is_databricks"] = True
            info["runtime_version"] = version
    except Exception as e:
        info["errors"].append(f"Environment check failed: {e}")

    # An active SparkSession means Spark is usable from this process.
    try:
        from pyspark.sql import SparkSession

        if SparkSession.getActiveSession():
            info["spark_available"] = True
    except Exception as e:
        info["errors"].append(f"Spark check failed: {e}")

    # dbutils lives in the driver notebook's IPython user namespace.
    try:
        import IPython

        shell = IPython.get_ipython()
        if shell and hasattr(shell, "user_ns") and "dbutils" in shell.user_ns:
            info["dbutils_available"] = True
    except Exception as e:
        info["errors"].append(f"dbutils check failed: {e}")

    if verbose:
        print(f" Databricks Runtime: {'[X]' if info['is_databricks'] else '[ ]'}")
        if info["runtime_version"]:
            print(f" Runtime Version: {info['runtime_version']}")
        print(f" Spark Available: {'[X]' if info['spark_available'] else '[ ]'}")
        print(f" dbutils Available: {'[X]' if info['dbutils_available'] else '[ ]'}")

        if info["errors"]:
            print("\n Errors:")
            for err in info["errors"]:
                print(f" - {err}")

    return info
@@ -0,0 +1,140 @@
1
+ """Telemetry utilities for OpenTelemetry integration."""
2
+
3
+ import os
4
+ import sys
5
+
6
+ try:
7
+ from opentelemetry import metrics, trace
8
+ from opentelemetry.trace import Status, StatusCode
9
+
10
+ AVAILABLE = True
11
+ except ImportError:
12
+ AVAILABLE = False
13
+
14
# --- Fallbacks for when OpenTelemetry is missing ---
# FIX: these stand-ins were previously defined unconditionally, silently
# shadowing the real Status/StatusCode imported above even when OpenTelemetry
# *was* installed — so callers building Status objects from this module got
# no-op mocks that real spans cannot use. Re-import here so the real classes
# win when available and the stand-ins only fill in on ImportError.

try:
    from opentelemetry.trace import Status, StatusCode
except ImportError:

    class StatusCode:
        # Minimal mirror of the opentelemetry.trace.StatusCode members used.
        OK = 1
        ERROR = 2

    class Status:
        # Accepts and discards the same arguments as the real Status.
        def __init__(self, status_code, description=""):
            pass
25
+
26
+
27
class MockSpan:
    """No-op span used when OpenTelemetry is not installed.

    Supports the context-manager protocol and silently accepts the span
    methods the rest of the codebase calls.
    """

    def __enter__(self):
        return self

    def __exit__(self, *args):
        return None

    def set_attribute(self, key, value):
        """Discard the attribute."""

    def set_status(self, status):
        """Discard the status."""

    def record_exception(self, exception):
        """Discard the exception."""

    def add_event(self, name, attributes=None):
        """Discard the event."""
45
+
46
+
47
class MockTracer:
    """No-op tracer returned by get_tracer() when OpenTelemetry is absent."""

    def start_as_current_span(self, name, kind=None, attributes=None):
        # Mirrors trace.Tracer.start_as_current_span; yields a no-op span.
        return MockSpan()
50
+
51
+
52
class MockCounter:
    """Counter stand-in that ignores every measurement."""

    def add(self, amount, attributes=None):
        """Discard the recorded amount."""
55
+
56
+
57
class MockHistogram:
    """Histogram stand-in that ignores every measurement."""

    def record(self, amount, attributes=None):
        """Discard the recorded amount."""
60
+
61
+
62
+ class MockMeter:
63
+ def create_counter(self, name, unit="", description=""):
64
+ return MockCounter()
65
+
66
+ def create_histogram(self, name, unit="", description=""):
67
+ return MockHistogram()
68
+
69
+
70
+ # --- Public API ---
71
+
72
+
73
def get_tracer(name: str):
    """Return the real OpenTelemetry tracer when available, else a no-op one."""
    return trace.get_tracer(name) if AVAILABLE else MockTracer()
78
+
79
+
80
def get_meter(name: str):
    """Return the real OpenTelemetry meter when available, else a no-op one."""
    return metrics.get_meter(name) if AVAILABLE else MockMeter()
85
+
86
+
87
def setup_telemetry(service_name: str = "odibi"):
    """Configure OpenTelemetry if available and configured.

    Checks OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
    If set, configures OTLP exporter.

    Args:
        service_name: Value used for the ``service.name`` resource attribute.
    """
    if not AVAILABLE:
        return

    # Check for OTLP endpoint
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
    if not endpoint:
        return

    try:
        # SDK pieces are imported lazily: the API package can be installed
        # without the SDK/exporter extras being present.
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        # Initialize Provider
        resource = Resource.create(attributes={"service.name": service_name})
        provider = TracerProvider(resource=resource)

        # OTLP Exporter
        exporter = OTLPSpanExporter(endpoint=endpoint)
        processor = BatchSpanProcessor(exporter)
        provider.add_span_processor(processor)

        # Set Global
        trace.set_tracer_provider(provider)

    except ImportError:
        # OTLP exporter might not be installed
        pass
    except Exception as e:
        # Telemetry is best-effort: never let setup failures break the app.
        print(f"Warning: Failed to initialize OpenTelemetry: {e}", file=sys.stderr)
124
+
125
+
126
# --- Global Instances ---

# Module-level tracer/meter; these resolve to the no-op mocks when
# OpenTelemetry is not installed, so importing this module is always safe.
tracer = get_tracer("odibi")
meter = get_meter("odibi")

# Metrics
# Count of pipeline nodes executed across the process lifetime.
nodes_executed = meter.create_counter(
    "odibi.nodes_executed", description="Number of nodes executed"
)

# Running total of rows processed by all nodes.
rows_processed = meter.create_counter("odibi.rows_processed", description="Total rows processed")

# Distribution of per-node execution time, in seconds.
node_duration = meter.create_histogram(
    "odibi.node_duration", unit="s", description="Duration of node execution"
)
+ )
@@ -0,0 +1,62 @@
1
+ """
2
+ Quality Enforcement and Validation
3
+ ===================================
4
+
5
+ This module enforces Odibi's quality standards through automated validation.
6
+
7
+ Features:
8
+ - Explanation linting: Ensure transformations are documented
9
+ - Quality scoring: Detect generic/lazy documentation
10
+ - Schema validation: Verify config structure
11
+ - Pre-run validation: Catch errors before execution
12
+ - Quarantine tables: Route failed rows to dedicated tables
13
+ - Quality gates: Batch-level validation thresholds
14
+ - FK validation: Referential integrity checks for star schemas
15
+
16
+ Principle: Enforce excellence, don't hope for it.
17
+ """
18
+
19
+ from .engine import Validator
20
+ from .explanation_linter import ExplanationLinter, LintIssue
21
+ from .fk import (
22
+ FKValidationReport,
23
+ FKValidationResult,
24
+ FKValidator,
25
+ OrphanRecord,
26
+ RelationshipConfig,
27
+ RelationshipRegistry,
28
+ get_orphan_records,
29
+ parse_relationships_config,
30
+ validate_fk_on_load,
31
+ )
32
+ from .gate import GateResult, evaluate_gate
33
+ from .quarantine import (
34
+ QuarantineResult,
35
+ add_quarantine_metadata,
36
+ has_quarantine_tests,
37
+ split_valid_invalid,
38
+ write_quarantine,
39
+ )
40
+
41
# Public names re-exported by ``odibi.validation`` (see imports above).
__all__ = [
    # Explanation linting
    "ExplanationLinter",
    "LintIssue",
    # Config/pre-run validation
    "Validator",
    # Quality gates
    "GateResult",
    "evaluate_gate",
    # Quarantine handling
    "QuarantineResult",
    "add_quarantine_metadata",
    "has_quarantine_tests",
    "split_valid_invalid",
    "write_quarantine",
    # Foreign-key / referential integrity
    "FKValidator",
    "FKValidationResult",
    "FKValidationReport",
    "OrphanRecord",
    "RelationshipConfig",
    "RelationshipRegistry",
    "get_orphan_records",
    "validate_fk_on_load",
    "parse_relationships_config",
]
# NOTE(review): subpackage version differs from the 2.5.0 distribution
# version — confirm whether this is meant to track the package release.
__version__ = "1.3.0-alpha.1"