odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/lineage.py ADDED
@@ -0,0 +1,511 @@
+ import logging
+ import os
+ import uuid
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List, Optional, Union
+
+ try:
+     from openlineage.client import OpenLineageClient
+     from openlineage.client.facet import (
+         DocumentationJobFacet,
+         ErrorMessageRunFacet,
+         NominalTimeRunFacet,
+         ParentRunFacet,
+         ProcessingEngineRunFacet,
+         SchemaDatasetFacet,
+         SchemaField,
+         SourceCodeJobFacet,
+     )
+     from openlineage.client.run import (
+         InputDataset,
+         Job,
+         OutputDataset,
+         Run,
+         RunEvent,
+         RunState,
+     )
+
+     HAS_OPENLINEAGE = True
+ except ImportError:
+     HAS_OPENLINEAGE = False
+     # Fallback aliases so type hints referencing these names still resolve.
+     InputDataset = Any
+     OutputDataset = Any
+     RunEvent = Any
+
+ from odibi.config import LineageConfig, NodeConfig, PipelineConfig
+ from odibi.node import NodeResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenLineageAdapter:
+     """Adapter for OpenLineage integration."""
+
+     def __init__(self, config: Optional[LineageConfig] = None):
+         self.enabled = HAS_OPENLINEAGE and config is not None
+         if not HAS_OPENLINEAGE:
+             logger.debug("OpenLineage not installed. Skipping lineage.")
+             return
+
+         if not config:
+             self.enabled = False
+             return
+
+         url = config.url or os.getenv("OPENLINEAGE_URL")
+         api_key = config.api_key or os.getenv("OPENLINEAGE_API_KEY")
+
+         if not url:
+             self.enabled = False
+             return
+
+         try:
+             self.client = OpenLineageClient(url=url, api_key=api_key)
+             self.namespace = config.namespace
+             self.pipeline_run_id = None
+             self.pipeline_name = None
+         except Exception as e:
+             logger.warning(f"Failed to initialize OpenLineage client: {e}", exc_info=True)
+             self.enabled = False
+
+     def emit_pipeline_start(self, pipeline_config: PipelineConfig) -> str:
+         """Emit pipeline start event (parent run)."""
+         if not self.enabled:
+             return str(uuid.uuid4())
+
+         try:
+             self.pipeline_run_id = str(uuid.uuid4())
+             self.pipeline_name = pipeline_config.pipeline
+
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             run = Run(
+                 runId=self.pipeline_run_id,
+                 facets={
+                     "nominalTime": NominalTimeRunFacet(
+                         nominalStartTime=event_time, nominalEndTime=None
+                     ),
+                     "processing_engine": ProcessingEngineRunFacet(
+                         version=__import__("odibi").__version__,
+                         name="Odibi",
+                         openlineageAdapterVersion=__import__("odibi").__version__,
+                     ),
+                 },
+             )
+
+             job = Job(
+                 namespace=self.namespace,
+                 name=pipeline_config.pipeline,
+                 facets={
+                     "documentation": DocumentationJobFacet(
+                         description=pipeline_config.description or "Odibi Pipeline"
+                     )
+                 },
+             )
+
+             event = RunEvent(
+                 eventType=RunState.START,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+             return self.pipeline_run_id
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage pipeline start: {e}", exc_info=True)
+             return str(uuid.uuid4())
+
+     def emit_pipeline_complete(self, pipeline_config: PipelineConfig, results: Any) -> None:
+         """Emit pipeline completion event."""
+         if not self.enabled or not self.pipeline_run_id:
+             return
+
+         try:
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             # Determine success based on results
+             success = not results.failed
+             event_type = RunState.COMPLETE if success else RunState.FAIL
+
+             run_facets = {}
+             if not success:
+                 run_facets["errorMessage"] = ErrorMessageRunFacet(
+                     message=f"Pipeline failed with nodes: {results.failed}",
+                     programmingLanguage="python",
+                 )
+
+             run = Run(runId=self.pipeline_run_id, facets=run_facets)
+
+             job = Job(namespace=self.namespace, name=pipeline_config.pipeline)
+
+             event = RunEvent(
+                 eventType=event_type,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage pipeline complete: {e}", exc_info=True)
+
+     def emit_node_start(self, config: NodeConfig, parent_run_id: str) -> str:
+         """Emit node start event and return the node run ID."""
+         if not self.enabled:
+             return str(uuid.uuid4())
+
+         try:
+             run_id = str(uuid.uuid4())
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             # Resolve inputs. Instantiated connections are not available here
+             # (the adapter is initialized once, without them), so dataset names
+             # are resolved from the config strings as a best effort.
+             inputs = []
+             if config.read:
+                 ds = self._create_dataset_from_config(config.read, is_input=True)
+                 if ds:
+                     inputs.append(ds)
+             elif config.depends_on:
+                 # OpenLineage tracks datasets; dependency inputs are internal,
+                 # ephemeral DataFrames, so there is nothing to register here.
+                 pass
+
+             run_facets = {
+                 "parent": ParentRunFacet(
+                     run={"runId": parent_run_id},
+                     job={
+                         "namespace": self.namespace,
+                         "name": self.pipeline_name or "unknown_pipeline",
+                     },
+                 )
+             }
+
+             job_facets = {
+                 "sourceCode": SourceCodeJobFacet(
+                     language="python",
+                     source_code=(
+                         str(config.model_dump_json())
+                         if hasattr(config, "model_dump_json")
+                         else str(config.model_dump())
+                     ),
+                 )
+             }
+
+             if config.description:
+                 job_facets["documentation"] = DocumentationJobFacet(description=config.description)
+
+             run = Run(runId=run_id, facets=run_facets)
+
+             job = Job(
+                 namespace=self.namespace,
+                 name=f"{self.pipeline_name}.{config.name}",
+                 facets=job_facets,
+             )
+
+             event = RunEvent(
+                 eventType=RunState.START,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=inputs,
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+             return run_id
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage node start: {e}")
+             return str(uuid.uuid4())
+
+     def emit_node_complete(self, config: NodeConfig, result: NodeResult, run_id: str) -> None:
+         """Emit node completion event."""
+         if not self.enabled or not run_id:
+             return
+
+         try:
+             event_time = datetime.now(timezone.utc).isoformat()
+             event_type = RunState.COMPLETE if result.success else RunState.FAIL
+
+             outputs = []
+             if config.write:
+                 ds = self._create_dataset_from_config(
+                     config.write, is_input=False, schema=result.result_schema
+                 )
+                 if ds:
+                     outputs.append(ds)
+
+             run_facets = {}
+             if not result.success and result.error:
+                 run_facets["errorMessage"] = ErrorMessageRunFacet(
+                     message=str(result.error), programmingLanguage="python"
+                 )
+
+             run = Run(runId=run_id, facets=run_facets)
+
+             job = Job(namespace=self.namespace, name=f"{self.pipeline_name}.{config.name}")
+
+             event = RunEvent(
+                 eventType=event_type,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=outputs,
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage node complete: {e}")
+
+     def _create_dataset_from_config(
+         self, config_op: Any, is_input: bool, schema: Any = None
+     ) -> Optional[Union[InputDataset, OutputDataset]]:
+         """Create OpenLineage Dataset from Read/Write config."""
+         # Best-effort naming
+         try:
+             conn_name = config_op.connection
+             name = config_op.path or config_op.table or "unknown"
+
+             # Namespace strategy: connection name usually maps to a storage account/container
+             namespace = f"{self.namespace}.{conn_name}"
+
+             facets = {}
+             if schema:
+                 fields = []
+                 # schema is usually a dict {col: type}
+                 if isinstance(schema, dict):
+                     for col, dtype in schema.items():
+                         fields.append(SchemaField(name=col, type=str(dtype)))
+
+                 if fields:
+                     facets["schema"] = SchemaDatasetFacet(fields=fields)
+
+             if is_input:
+                 return InputDataset(namespace=namespace, name=name, facets=facets)
+             else:
+                 return OutputDataset(namespace=namespace, name=name, facets=facets)
+         except Exception:
+             return None
+
+
+ class LineageTracker:
+     """Track cross-pipeline lineage relationships.
+
+     This class provides table-level lineage tracking across pipelines,
+     storing relationships in the System Catalog for later querying.
+
+     Example:
+         ```python
+         tracker = LineageTracker(catalog)
+         tracker.record_lineage(
+             read_config=node.read,
+             write_config=node.write,
+             pipeline="silver_pipeline",
+             node="process_customers",
+             run_id="run-123",
+             connections=connections,
+         )
+         ```
+     """
+
+     def __init__(self, catalog: Optional[Any] = None):
+         """Initialize LineageTracker.
+
+         Args:
+             catalog: CatalogManager instance for persistence
+         """
+         self.catalog = catalog
+
+     def record_lineage(
+         self,
+         read_config: Optional[Any],
+         write_config: Optional[Any],
+         pipeline: str,
+         node: str,
+         run_id: str,
+         connections: Dict[str, Any],
+     ) -> None:
+         """Record lineage from a node's read/write config.
+
+         Args:
+             read_config: ReadConfig from the node
+             write_config: WriteConfig from the node
+             pipeline: Pipeline name
+             node: Node name
+             run_id: Execution run ID
+             connections: Dictionary of connection configurations
+         """
+         if not self.catalog or not write_config:
+             return
+
+         target_table = self._resolve_table_path(write_config, connections)
+         if not target_table:
+             return
+
+         if read_config:
+             source_table = self._resolve_table_path(read_config, connections)
+             if source_table:
+                 self.catalog.record_lineage(
+                     source_table=source_table,
+                     target_table=target_table,
+                     target_pipeline=pipeline,
+                     target_node=node,
+                     run_id=run_id,
+                 )
+
+     def record_dependency_lineage(
+         self,
+         depends_on: List[str],
+         write_config: Optional[Any],
+         pipeline: str,
+         node: str,
+         run_id: str,
+         node_outputs: Dict[str, str],
+         connections: Dict[str, Any],
+     ) -> None:
+         """Record lineage from node dependencies.
+
+         Args:
+             depends_on: List of dependency node names
+             write_config: WriteConfig from the node
+             pipeline: Pipeline name
+             node: Node name
+             run_id: Execution run ID
+             node_outputs: Map of node names to their output table paths
+             connections: Dictionary of connection configurations
+         """
+         if not self.catalog or not write_config:
+             return
+
+         target_table = self._resolve_table_path(write_config, connections)
+         if not target_table:
+             return
+
+         for dep_node in depends_on:
+             source_table = node_outputs.get(dep_node)
+             if source_table:
+                 self.catalog.record_lineage(
+                     source_table=source_table,
+                     target_table=target_table,
+                     source_pipeline=pipeline,
+                     source_node=dep_node,
+                     target_pipeline=pipeline,
+                     target_node=node,
+                     run_id=run_id,
+                 )
+
+     def _resolve_table_path(
+         self,
+         config: Any,
+         connections: Dict[str, Any],
+     ) -> Optional[str]:
+         """Resolve full table path from read/write config.
+
+         Args:
+             config: ReadConfig or WriteConfig
+             connections: Dictionary of connection configurations
+
+         Returns:
+             Full table path (e.g., "connection/path" or "catalog.schema.table")
+         """
+         try:
+             conn_name = config.connection
+             path = getattr(config, "path", None)
+             table = getattr(config, "table", None)
+
+             if table:
+                 conn = connections.get(conn_name)
+                 if conn and hasattr(conn, "schema_name"):
+                     catalog = getattr(conn, "catalog", "")
+                     schema = conn.schema_name
+                     return f"{catalog}.{schema}.{table}" if catalog else f"{schema}.{table}"
+                 return f"{conn_name}.{table}"
+
+             if path:
+                 return f"{conn_name}/{path}"
+
+             return None
+         except Exception:
+             return None
+
+     def get_upstream(self, table_path: str, depth: int = 3) -> List[Dict]:
+         """Get all upstream sources for a table.
+
+         Args:
+             table_path: Table to trace upstream from
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             List of upstream lineage records with depth information
+         """
+         if not self.catalog:
+             return []
+         return self.catalog.get_upstream(table_path, depth)
+
+     def get_downstream(self, table_path: str, depth: int = 3) -> List[Dict]:
+         """Get all downstream consumers of a table.
+
+         Args:
+             table_path: Table to trace downstream from
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             List of downstream lineage records with depth information
+         """
+         if not self.catalog:
+             return []
+         return self.catalog.get_downstream(table_path, depth)
+
+     def get_impact_analysis(self, table_path: str, depth: int = 3) -> Dict[str, Any]:
+         """Perform impact analysis for a table.
+
+         Args:
+             table_path: Table to analyze impact for
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             Dict containing:
+                 - table: the table that was analyzed
+                 - affected_tables: list of downstream tables
+                 - affected_pipelines: list of affected pipelines
+                 - total_depth: maximum depth reached
+                 - downstream_count: number of downstream lineage records
+         """
+         downstream = self.get_downstream(table_path, depth)
+
+         affected_tables = set()
+         affected_pipelines = set()
+         max_depth = 0
+
+         for record in downstream:
+             target = record.get("target_table")
+             if target:
+                 affected_tables.add(target)
+             pipeline = record.get("target_pipeline")
+             if pipeline:
+                 affected_pipelines.add(pipeline)
+             record_depth = record.get("depth", 0)
+             if record_depth > max_depth:
+                 max_depth = record_depth
+
+         return {
+             "table": table_path,
+             "affected_tables": list(affected_tables),
+             "affected_pipelines": list(affected_pipelines),
+             "total_depth": max_depth,
+             "downstream_count": len(downstream),
+         }
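
Usage note: `LineageTracker` only requires an object exposing `record_lineage`, `get_upstream`, and `get_downstream` (the System Catalog in practice). Below is a minimal sketch of the impact-analysis path, assuming a hypothetical in-memory stand-in for `CatalogManager`; the stub class, table names, and single-hop traversal are illustrative assumptions, not part of the package.

```python
from odibi.lineage import LineageTracker


class InMemoryCatalog:
    """Hypothetical stand-in for CatalogManager; stores lineage edges in a list."""

    def __init__(self):
        self.edges = []

    def record_lineage(self, **edge):
        self.edges.append(edge)

    def get_upstream(self, table_path, depth):
        # Single-hop only; the real catalog traverses up to `depth` levels.
        return [dict(e, depth=1) for e in self.edges if e.get("target_table") == table_path]

    def get_downstream(self, table_path, depth):
        return [dict(e, depth=1) for e in self.edges if e.get("source_table") == table_path]


tracker = LineageTracker(InMemoryCatalog())
tracker.catalog.record_lineage(
    source_table="adls_bronze/customers.parquet",
    target_table="sales.dbo.dim_customer",
    target_pipeline="silver_pipeline",
    target_node="process_customers",
    run_id="run-123",
)

impact = tracker.get_impact_analysis("adls_bronze/customers.parquet", depth=3)
# {'table': 'adls_bronze/customers.parquet',
#  'affected_tables': ['sales.dbo.dim_customer'],
#  'affected_pipelines': ['silver_pipeline'],
#  'total_depth': 1, 'downstream_count': 1}
```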