PyPI - dash-gov - Versions diffs - 0.1.1__py3-none-any.whl - Mend

dash-gov 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

dash_gov-0.1.1.dist-info/METADATA +64 -0
dash_gov-0.1.1.dist-info/RECORD +9 -0
dash_gov-0.1.1.dist-info/WHEEL +4 -0
dashgov/__init__.py +18 -0
dashgov/classifier.py +144 -0
dashgov/lineage.py +312 -0
dashgov/parser.py +201 -0
dashgov/scanner.py +117 -0
dashgov/ui.py +167 -0

dash_gov-0.1.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,64 @@
+Metadata-Version: 2.4
+Name: dash-gov
+Version: 0.1.1
+Summary: Data lineage and governance for Databricks — table/column lineage, classification, and a built-in notebook UI
+Project-URL: Homepage, https://github.com/dash-libs/dash-gov
+Author-email: Darshan Shah <darshan.innovation@gmail.com>
+License: Apache-2.0
+Keywords: data-catalog,databricks,governance,lineage,unity-catalog
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
+Requires-Dist: ipywidgets>=8.0
+Requires-Dist: sqlglot>=23.0
+Provides-Extra: dev
+Requires-Dist: hatch; extra == 'dev'
+Requires-Dist: pdoc; extra == 'dev'
+Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: pytest-cov; extra == 'dev'
+Requires-Dist: ruff; extra == 'dev'
+Description-Content-Type: text/markdown
+# DashGov — Databricks Library
+[![CI](https://github.com/dash-libs/dash-gov/actions/workflows/ci.yml/badge.svg)](https://github.com/dash-libs/dash-gov/actions)
+[![PyPI](https://img.shields.io/pypi/v/dash-gov)](https://pypi.org/project/dash-gov/)
+[![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
+Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
+## Installation
+```bash
+%pip install dash-gov
+```
+## Quick Start
+```python
+import dashgov
+dashgov.launch()   # Opens interactive UI in your Databricks notebook
+```
+## Part of Dashlibs
+| Library | Purpose |
+|---|---|
+| dash-dq | Data Quality |
+| dash-synthetic | Synthetic Data Generation |
+| dash-ml | ML Model Monitoring |
+| dash-ingest | Data Ingestion |
+| dash-gov | Data Governance |
+| dash-ontology | Ontology & Lineage for AI |
+## License
+Apache 2.0

dash_gov-0.1.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+dashgov/__init__.py,sha256=m2EPij-xHKLgcEec5WgqlCM__1CFnyeBF1CfqPoQzqk,574
+dashgov/classifier.py,sha256=rwdS54_Tugw4pw2hPHh-4XZgF79e5avNKVBunT8w6FE,4797
+dashgov/lineage.py,sha256=uE0UU-1FSC3qiQCED98k5H9pEXyXuf1ZMrq58nGsU9Y,11352
+dashgov/parser.py,sha256=IZ1GtW-d4rF9xChqhIrXkt2FcRBjIsFIQoo-d4M0Ua0,6574
+dashgov/scanner.py,sha256=5f-fEKDGqEP8rYa48JzEiAAlRHkdtjS4KMR_UfxyYQs,4070
+dashgov/ui.py,sha256=h04kuwbzirGTKyX28eggLpMH-mojmqCSaz-Dy2Fn_gE,7349
+dash_gov-0.1.1.dist-info/METADATA,sha256=YHEopPio4bD9UczTiWOBefEQspPDQwSim0rXbqA6fq0,2126
+dash_gov-0.1.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+dash_gov-0.1.1.dist-info/RECORD,,

dash_gov-0.1.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

dashgov/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""DashGov — Data lineage and governance for Databricks."""
+from dashgov.lineage import LineageGraph, build_lineage_graph, fetch_uc_lineage
+from dashgov.parser import parse_table_lineage, parse_column_lineage, parse_notebook_lineage
+from dashgov.classifier import classify_table, classify_all
+from dashgov.ui import launch
+# Package version string.
+__version__ = "0.1.1"
+# Public API: the names re-exported from the lineage, parser, classifier and ui
+# submodules imported above.
+# NOTE(review): nothing from dashgov.scanner (GovernanceScanner, GovReport) is
+# imported or listed here — confirm whether that omission is intentional.
+__all__ = [
+    "LineageGraph",
+    "build_lineage_graph",
+    "fetch_uc_lineage",
+    "parse_table_lineage",
+    "parse_column_lineage",
+    "parse_notebook_lineage",
+    "classify_table",
+    "classify_all",
+    "launch",
+]

dashgov/classifier.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""
+Table role classification based on naming, schema shape, and lineage position.
+Roles:
+  entity      — root fact tables representing business objects (Customer, Order)
+  fact        — transactional / event tables with FK refs to entities
+  junction    — bridge tables expressing many:many relationships
+  aggregation — pre-computed summary / reporting tables
+  staging     — intermediate / temp tables in a transformation pipeline
+  unknown     — cannot be classified with confidence
+"""
+from __future__ import annotations
+# ── Name prefix/suffix patterns ───────────────────────────────────────────────
+# Every pattern is lowercase; the helpers below lowercase the bare table or
+# column name before matching against these.
+# Prefixes of the bare table name that mark it as staging.
+_STAGING_PREFIXES   = {"stg_", "staging_", "tmp_", "temp_", "raw_", "src_", "landing_", "bronze_"}
+# Prefixes that classify_table maps to the "entity" role.
+_DIMENSION_PREFIXES = {"dim_", "d_"}
+# Prefixes consulted by classify_table together with the upstream count.
+# NOTE(review): the one-letter prefixes "d_" and "f_" match any name that merely
+# starts with that letter and an underscore — confirm this breadth is wanted.
+_FACT_PREFIXES      = {"fact_", "fct_", "f_"}
+# Suffixes of the bare table name that mark it as an aggregation.
+_AGG_SUFFIXES       = {
+    "_agg", "_aggregated", "_summary", "_report",
+    "_metrics", "_stats", "_kpi", "_rollup", "_daily",
+    "_weekly", "_monthly", "_yearly",
+}
+# Suffixes of the bare table name that mark it as a junction table.
+_JUNCTION_SUFFIXES  = {"_map", "_mapping", "_xref", "_bridge", "_link", "_rel", "_assoc", "_pivot"}
+# Column names that strongly suggest a primary key
+# (compared against whole lowercased column names by has_primary_key).
+_PK_PATTERNS = {"id", "pk", "key", "uuid", "guid"}
+# Column name endings that suggest a foreign key
+# (a tuple, unlike the sets above; only ever iterated by _ends_with_any).
+_FK_SUFFIXES = ("_id", "_pk", "_key", "_fk", "_ref", "_uuid")
+def _name_lower(table_name: str) -> str:
+    """Return the lowercased text after the last dot of *table_name*."""
+    # rpartition leaves the whole string in the last slot when there is no dot.
+    _, _, bare = table_name.rpartition(".")
+    return bare.lower()
+def _starts_with_any(name: str, prefixes: set[str]) -> bool:
+    """Tell whether *name* begins with at least one of *prefixes*."""
+    # str.startswith tests a whole tuple of candidates in one call.
+    return name.startswith(tuple(prefixes))
+def _ends_with_any(name: str, suffixes: set | tuple) -> bool:
+    """Tell whether *name* finishes with at least one of *suffixes*."""
+    # str.endswith tests a whole tuple of candidates in one call.
+    return name.endswith(tuple(suffixes))
+def count_fk_columns(columns: list[dict]) -> int:
+    """
+    Return how many columns have a foreign-key-looking name.
+    A column counts when its lowercased "name" ends with one of _FK_SUFFIXES;
+    a column named exactly "id" is never counted.  A missing "name" is
+    treated as the empty string.
+    """
+    total = 0
+    for col in columns:
+        lowered = col.get("name", "").lower()
+        if lowered == "id":
+            continue
+        if _ends_with_any(lowered, _FK_SUFFIXES):
+            total += 1
+    return total
+def has_primary_key(columns: list[dict]) -> bool:
+    """
+    Report whether any column name looks like a primary key.
+    A lowercased name qualifies when it is listed in _PK_PATTERNS, equals "id",
+    or ends with "_id" while being at most 10 characters long.
+    """
+    lowered = {col.get("name", "").lower() for col in columns}
+    if lowered & _PK_PATTERNS:
+        return True
+    # Parentheses spell out the precedence the original and/or chain relied on.
+    return any(
+        name == "id" or (name.endswith("_id") and len(name) <= 10)
+        for name in lowered
+    )
+def classify_table(
+    full_name: str,
+    columns: list[dict],
+    n_upstream: int = 0,
+    n_downstream: int = 0,
+) -> tuple[str, float]:
+    """
+    Infer a table's role from its name, its columns and its lineage position.
+    Returns (role, confidence).  Rules are tried top to bottom; first match wins:
+      staging name prefix                                  → ("staging", 0.90)
+      aggregation name suffix                              → ("aggregation", 0.90)
+      fact name prefix with at least one upstream table    → ("aggregation", 0.75)
+      dimension name prefix                                → ("entity", 0.90)
+      junction name suffix                                 → ("junction", 0.88)
+      >= 2 columns, >= 2 FK columns, FK share >= 60 %      → ("junction", 0.80)
+      no upstream, PK-like column, >= 3 columns            → ("entity", 0.82)
+      no upstream, >= 5 columns                            → ("entity", 0.65)
+      upstream, >= 1 FK column, >= 1 downstream            → ("fact", 0.70)
+      upstream, >= 2 FK columns                            → ("fact", 0.65)
+      >= 2 upstream and no downstream                      → ("aggregation", 0.60)
+      otherwise                                            → ("unknown", 0.40)
+    """
+    bare = _name_lower(full_name)
+    col_count = len(columns)
+    fk_count = count_fk_columns(columns)
+    pk_found = has_primary_key(columns)
+    # ── Name-driven rules ──
+    if _starts_with_any(bare, _STAGING_PREFIXES):
+        return "staging", 0.90
+    if _ends_with_any(bare, _AGG_SUFFIXES):
+        return "aggregation", 0.90
+    # NOTE(review): a fact-prefixed table that has upstream tables is reported
+    # as "aggregation", not "fact" — confirm this mapping is intended.
+    if n_upstream > 0 and _starts_with_any(bare, _FACT_PREFIXES):
+        return "aggregation", 0.75
+    if _starts_with_any(bare, _DIMENSION_PREFIXES):
+        return "entity", 0.90
+    if _ends_with_any(bare, _JUNCTION_SUFFIXES):
+        return "junction", 0.88
+    # ── Shape-driven junction rule: the table is mostly FK columns ──
+    if col_count >= 2 and fk_count >= 2 and fk_count / max(col_count, 1) >= 0.6:
+        return "junction", 0.80
+    # ── Root tables (nothing feeds them) read as entities ──
+    if n_upstream == 0:
+        if pk_found and col_count >= 3:
+            return "entity", 0.82
+        if col_count >= 5:
+            return "entity", 0.65
+    # ── Derived tables carrying FK columns read as facts ──
+    if n_upstream >= 1:
+        if fk_count >= 1 and n_downstream >= 1:
+            return "fact", 0.70
+        if fk_count >= 2:
+            return "fact", 0.65
+    # ── Terminal tables fed by several sources read as aggregations ──
+    if n_upstream >= 2 and n_downstream == 0:
+        return "aggregation", 0.60
+    return "unknown", 0.40
+def classify_all(
+    tables: dict,   # {full_name: {"columns": [...], "role": ...}}
+    upstream_counts: dict[str, int],
+    downstream_counts: dict[str, int],
+) -> dict[str, tuple[str, float]]:
+    """
+    Run classify_table over every entry of *tables*.
+    A table missing from either count mapping is treated as having 0
+    neighbours on that side; a missing "columns" entry is treated as [].
+    Returns {full_name: (role, confidence)}.
+    """
+    roles: dict[str, tuple[str, float]] = {}
+    for table_name, meta in tables.items():
+        cols = meta.get("columns", [])
+        ups = upstream_counts.get(table_name, 0)
+        downs = downstream_counts.get(table_name, 0)
+        roles[table_name] = classify_table(table_name, cols, ups, downs)
+    return roles

dashgov/lineage.py ADDED Viewed

@@ -0,0 +1,312 @@
+"""
+Lineage graph — table-level and column-level data lineage.
+Works with plain Python dicts so it is fully testable without Spark or UC.
+Use fetch_uc_lineage() to pull live data from a Unity Catalog workspace.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from collections import deque
+from typing import Optional
+@dataclass
+class TableNode:
+    """One table of the lineage graph together with its column metadata."""
+    full_name: str          # catalog.schema.table
+    # build_lineage_graph derives the next three fields by splitting full_name
+    # on "."; catalog and schema_name are "" when the name has too few parts.
+    catalog: str
+    schema_name: str
+    table: str
+    columns: list[dict]     # [{"name": str, "type": str, "nullable": bool}]
+    role: str = "unknown"   # entity | fact | junction | aggregation | staging | unknown
+@dataclass
+class TableEdge:
+    """Directed table-level edge: data flows from *source* into *target*."""
+    source: str   # full table name
+    target: str   # full table name
+@dataclass
+class ColumnEdge:
+    """Directed column-level edge from a source column to a target column."""
+    source_table: str
+    source_column: str
+    target_table: str
+    target_column: str
+    transformation: Optional[str] = None  # SQL expression when known
+class LineageGraph:
+    """
+    Directed graph of table-level and column-level lineage.
+    Table edges are indexed into upstream/downstream adjacency maps when the
+    graph is built; column edges are kept as a flat list and scanned on demand.
+    The graph is meant to be acyclic, but that is not enforced: every traversal
+    tracks visited nodes, so a cycle in the input cannot cause an endless loop.
+    Traversal results come back in a deterministic order.
+    """
+    def __init__(
+        self,
+        tables: dict[str, TableNode],
+        table_edges: list[TableEdge],
+        column_edges: list[ColumnEdge],
+    ):
+        self.tables = tables
+        self.table_edges = table_edges
+        self.column_edges = column_edges
+        # Adjacency maps: _downstream[source] = {targets}, _upstream[target] = {sources}.
+        # Only tables that take part in at least one edge get an entry.
+        self._downstream: dict[str, set[str]] = {}
+        self._upstream: dict[str, set[str]] = {}
+        for e in table_edges:
+            self._downstream.setdefault(e.source, set()).add(e.target)
+            self._upstream.setdefault(e.target, set()).add(e.source)
+    # ── Table-level traversal ────────────────────────────────────────────────
+    def upstream_tables(self, table: str, depth: int = 1) -> list[str]:
+        """All tables that feed into *table*, up to *depth* hops (nearest first)."""
+        return self._bfs(table, self._upstream, depth)
+    def downstream_tables(self, table: str, depth: int = 1) -> list[str]:
+        """All tables that consume from *table*, up to *depth* hops (nearest first)."""
+        return self._bfs(table, self._downstream, depth)
+    def root_sources(self, table: str) -> list[str]:
+        """
+        Tables with no upstream that eventually feed into *table*, sorted.
+        *table* itself is never reported, even when it has no upstream.
+        """
+        visited, result = set(), []
+        stack = [table]
+        while stack:
+            t = stack.pop()
+            if t in visited:
+                continue
+            visited.add(t)
+            ups = self._upstream.get(t, ())
+            if not ups and t != table:
+                result.append(t)
+            stack.extend(ups)
+        return sorted(result)
+    def impact_analysis(self, table: str) -> dict:
+        """
+        What breaks if *table* changes — full downstream tree.
+        Returns a dict with the direct dependents, every transitive downstream
+        table, and, per source column of *table*, the "table.column" targets it
+        feeds directly.
+        """
+        direct = sorted(self._downstream.get(table, ()))
+        # No hop limit: the walk ends once every reachable table was visited.
+        all_downstream = self._bfs(table, self._downstream, depth=float("inf"))
+        col_targets: dict[str, list[str]] = {}
+        for ce in self.column_edges:
+            if ce.source_table == table:
+                col_targets.setdefault(ce.source_column, []).append(
+                    f"{ce.target_table}.{ce.target_column}"
+                )
+        return {
+            "table": table,
+            "direct_dependents": direct,
+            "all_downstream": all_downstream,
+            "affected_column_paths": col_targets,
+            "total_affected_tables": len(all_downstream),
+        }
+    # ── Column-level traversal ───────────────────────────────────────────────
+    def column_sources(self, table: str, column: str) -> list[ColumnEdge]:
+        """Edges that feed into *table.column*."""
+        return [
+            e for e in self.column_edges
+            if e.target_table == table and e.target_column == column
+        ]
+    def column_targets(self, table: str, column: str) -> list[ColumnEdge]:
+        """Edges that *table.column* feeds into."""
+        return [
+            e for e in self.column_edges
+            if e.source_table == table and e.source_column == column
+        ]
+    def column_lineage_chain(self, table: str, column: str) -> dict:
+        """
+        Full upstream chain for a single column.
+        Each upstream column is listed once, even when several paths lead to it.
+        """
+        # Tuple keys keep ("a.b", "c") and ("a", "b.c") apart, which a joined
+        # "table.column" string cannot.
+        expanded: set[tuple[str, str]] = set()
+        emitted: set[tuple[str, str]] = set()
+        upstream: list[dict] = []
+        stack = [(table, column)]
+        while stack:
+            current = stack.pop()
+            if current in expanded:
+                continue
+            expanded.add(current)
+            for src in self.column_sources(*current):
+                origin = (src.source_table, src.source_column)
+                if origin not in emitted:
+                    emitted.add(origin)
+                    upstream.append({"table": origin[0], "column": origin[1]})
+                stack.append(origin)
+        return {
+            "table": table,
+            "column": column,
+            "upstream_columns": upstream,
+        }
+    # ── Export ───────────────────────────────────────────────────────────────
+    def to_dict(self) -> dict:
+        """Return the whole graph as plain dicts and lists."""
+        return {
+            "tables": {
+                k: {
+                    "full_name": v.full_name,
+                    "catalog": v.catalog,
+                    "schema_name": v.schema_name,
+                    "table": v.table,
+                    "columns": v.columns,
+                    "role": v.role,
+                }
+                for k, v in self.tables.items()
+            },
+            "table_edges": [
+                {"source": e.source, "target": e.target} for e in self.table_edges
+            ],
+            "column_edges": [
+                {
+                    "source_table": e.source_table,
+                    "source_column": e.source_column,
+                    "target_table": e.target_table,
+                    "target_column": e.target_column,
+                    "transformation": e.transformation,
+                }
+                for e in self.column_edges
+            ],
+        }
+    def summary(self) -> dict:
+        """
+        Return size counters plus the registered tables that have no upstream
+        edge ("root_sources") and those that have no downstream edge ("leaf_sinks").
+        """
+        return {
+            "total_tables": len(self.tables),
+            "total_table_edges": len(self.table_edges),
+            "total_column_edges": len(self.column_edges),
+            "root_sources": [t for t in self.tables if t not in self._upstream],
+            "leaf_sinks": [t for t in self.tables if t not in self._downstream],
+        }
+    # ── Internal ─────────────────────────────────────────────────────────────
+    def _bfs(self, start: str, adj: dict, depth: float) -> list[str]:
+        """
+        Breadth-first walk over *adj* from *start*, at most *depth* hops away.
+        *start* is not part of the result.  Neighbours are visited in sorted
+        order so the result does not depend on set iteration order.
+        """
+        visited, result = {start}, []
+        queue = deque([(start, 0)])
+        while queue:
+            node, d = queue.popleft()
+            if d >= depth:
+                continue
+            for neighbour in sorted(adj.get(node, ())):
+                if neighbour not in visited:
+                    visited.add(neighbour)
+                    result.append(neighbour)
+                    queue.append((neighbour, d + 1))
+        return result
+# ── Constructors ─────────────────────────────────────────────────────────────
+def build_lineage_graph(
+    tables: list[dict],
+    table_edges: list[dict],
+    column_edges: list[dict],
+) -> LineageGraph:
+    """
+    Assemble a LineageGraph out of plain dicts.
+    tables      — [{"full_name": str, "columns": [{name, type, nullable}], ...}]
+    table_edges — [{"source": str, "target": str}]
+    column_edges — [{"source_table", "source_column", "target_table", "target_column"}]
+    "full_name" is split on ".": with three or more parts the first two become
+    catalog and schema, with two parts the first becomes the schema, and the
+    last part is always the table.  Optional keys: "columns" and "role" on a
+    table, "transformation" on a column edge.
+    """
+    nodes: dict[str, TableNode] = {}
+    for spec in tables:
+        name = spec["full_name"]
+        segments = name.split(".")
+        if len(segments) >= 3:
+            catalog, schema = segments[0], segments[1]
+        elif len(segments) == 2:
+            catalog, schema = "", segments[0]
+        else:
+            catalog, schema = "", ""
+        nodes[name] = TableNode(
+            full_name=name,
+            catalog=catalog,
+            schema_name=schema,
+            table=segments[-1],
+            columns=spec.get("columns", []),
+            role=spec.get("role", "unknown"),
+        )
+    t_edges = []
+    for raw in table_edges:
+        t_edges.append(TableEdge(source=raw["source"], target=raw["target"]))
+    c_edges = []
+    for raw in column_edges:
+        c_edges.append(
+            ColumnEdge(
+                source_table=raw["source_table"],
+                source_column=raw["source_column"],
+                target_table=raw["target_table"],
+                target_column=raw["target_column"],
+                transformation=raw.get("transformation"),
+            )
+        )
+    return LineageGraph(nodes, t_edges, c_edges)
+def fetch_uc_lineage(
+    table: str,
+    workspace_url: str,
+    token: str,
+    depth: int = 2,
+) -> dict:
+    """
+    Fetch table-level and column-level lineage from Unity Catalog REST API.
+    Starting at *table*, neighbours are discovered level by level and the
+    tables of the first *depth* levels (the root being the first) are queried.
+    Column lineage is requested for the root table only.
+    Returns {"tables": [...], "table_edges": [...], "column_edges": [...]},
+    the shape build_lineage_graph() accepts.  Every returned table carries an
+    empty "columns" list because no schema information is requested.
+    Requests that do not answer with HTTP 200 are skipped with a warning, so
+    the result may be partial.
+    Raises RuntimeError when the requests package is not installed.
+    Requires workspace_url (https://...) and a Databricks PAT.
+    """
+    import warnings
+    try:
+        import requests
+    except ImportError as err:
+        raise RuntimeError("requests is required: pip install requests") from err
+    headers = {"Authorization": f"Bearer {token}"}
+    base = workspace_url.rstrip("/")
+    visited_tables: set[str] = {table}
+    seen_edges: set[tuple[str, str]] = set()
+    table_edges: list[dict] = []
+    column_edges: list[dict] = []
+    def _link(src: str, tgt: str) -> None:
+        """Record the edge src → tgt once, however many responses report it."""
+        if (src, tgt) not in seen_edges:
+            seen_edges.add((src, tgt))
+            table_edges.append({"source": src, "target": tgt})
+    # NOTE(review): the published Databricks lineage REST reference appears to
+    # document ".../lineage-tracking/table-lineage" and ".../column-lineage"
+    # (singular) with differently named response fields — confirm the paths and
+    # keys used below against the target workspace.
+    frontier = [table]
+    for _ in range(depth):
+        next_frontier: list[str] = []
+        for t in frontier:
+            resp = requests.get(
+                f"{base}/api/2.0/lineage-tracking/table-lineages",
+                headers=headers,
+                params={"table_name": t},
+                timeout=15,
+            )
+            if resp.status_code != 200:
+                warnings.warn(
+                    f"table lineage lookup for {t!r} returned HTTP "
+                    f"{resp.status_code}; skipping"
+                )
+                continue
+            data = resp.json()
+            for up in data.get("upstream_tables", []):
+                src = up.get("name", "")
+                if not src or src == t:
+                    continue
+                # Record the edge even when src is already known: skipping it
+                # would drop edges between tables reached by another path.
+                _link(src, t)
+                if src not in visited_tables:
+                    visited_tables.add(src)
+                    next_frontier.append(src)
+            for down in data.get("downstream_tables", []):
+                tgt = down.get("name", "")
+                if not tgt or tgt == t:
+                    continue
+                _link(t, tgt)
+                if tgt not in visited_tables:
+                    visited_tables.add(tgt)
+                    next_frontier.append(tgt)
+        frontier = next_frontier
+    # Column lineage for the root table
+    col_resp = requests.get(
+        f"{base}/api/2.0/lineage-tracking/column-lineages",
+        headers=headers,
+        params={"table_name": table},
+        timeout=15,
+    )
+    if col_resp.status_code == 200:
+        for col_data in col_resp.json().get("column_lineage", []):
+            tgt_col = col_data.get("name", "")
+            for up in col_data.get("upstream_columns", []):
+                column_edges.append({
+                    "source_table": up.get("table_name", ""),
+                    "source_column": up.get("name", ""),
+                    "target_table": table,
+                    "target_column": tgt_col,
+                })
+    else:
+        warnings.warn(
+            f"column lineage lookup for {table!r} returned HTTP "
+            f"{col_resp.status_code}; skipping"
+        )
+    # Sorted so the table list does not depend on set iteration order.
+    tables_list = [{"full_name": t, "columns": []} for t in sorted(visited_tables)]
+    return {
+        "tables": tables_list,
+        "table_edges": table_edges,
+        "column_edges": column_edges,
+    }

dashgov/parser.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""
+SQL-based lineage extraction.
+Parses CREATE TABLE AS SELECT, INSERT INTO SELECT, and plain SELECT
+statements to extract table-level and column-level lineage without
+requiring a live Unity Catalog connection.
+Requires sqlglot (pure Python, no Spark dependency).
+"""
+from __future__ import annotations
+def _sqlglot():
+    """Import sqlglot on first use and return the module.
+    Raises RuntimeError with an install hint when sqlglot is unavailable.
+    """
+    try:
+        import sqlglot
+    except ImportError:
+        raise RuntimeError("sqlglot is required: pip install sqlglot")
+    return sqlglot
+def parse_table_lineage(sql: str, dialect: str = "spark") -> dict:
+    """
+    Extract table-level lineage from one SQL statement.
+    Returns:
+        {
+          "target": str | None,       # the table being written to
+          "sources": [str, ...],      # tables being read from, first-seen order
+          "type": "ctas"|"insert"|"select"|"unknown"
+        }
+    Any CREATE statement is reported as "ctas".  A statement that fails to
+    parse, or is of another kind, yields type "unknown" with no tables.
+    NOTE(review): "target" is the bare table name while "sources" are
+    catalog/schema-qualified where the SQL qualifies them — confirm that this
+    asymmetry is intended.
+    """
+    sqlglot = _sqlglot()
+    exp = sqlglot.exp
+    try:
+        tree = sqlglot.parse_one(sql, dialect=dialect)
+    except Exception:
+        return {"target": None, "sources": [], "type": "unknown"}
+    def qualified(node) -> str:
+        """Join whichever of catalog / db / name are set on a table node."""
+        segments = [s for s in (node.catalog, node.db, node.name) if s]
+        if segments:
+            return ".".join(segments)
+        return node.name or ""
+    def referenced(scope) -> list[str]:
+        """Qualified names of every named table found under *scope*."""
+        names = []
+        for tbl_node in scope.find_all(exp.Table):
+            if tbl_node.name:
+                names.append(qualified(tbl_node))
+        return names
+    if isinstance(tree, exp.Create):
+        kind = "ctas"
+        read_names = referenced(tree)
+    elif isinstance(tree, exp.Insert):
+        kind = "insert"
+        # Only the SELECT part is searched, so the written table is not picked
+        # up merely because it appears in the INSERT header.
+        query = tree.find(sqlglot.exp.Select)
+        read_names = referenced(query) if query else []
+    elif isinstance(tree, (exp.Select, exp.Subquery)):
+        return {
+            "target": None,
+            "sources": list(dict.fromkeys(referenced(tree))),
+            "type": "select",
+        }
+    else:
+        return {"target": None, "sources": [], "type": "unknown"}
+    # The first table node of the statement is taken as the table written to.
+    written = tree.find(exp.Table)
+    written_full = qualified(written) if written else None
+    written_short = written.name if written else None
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    sources = list(dict.fromkeys(n for n in read_names if n != written_full))
+    return {"target": written_short, "sources": sources, "type": kind}
+def parse_column_lineage(
+    sql: str,
+    target_table: str,
+    dialect: str = "spark",
+) -> list[dict]:
+    """
+    Extract column-level lineage from a SQL statement.
+    Returns one entry per item of the first SELECT found in the statement:
+        {
+          "target_table": str,        # *target_table*, echoed back
+          "target_column": str,       # alias, else column name, else SQL text
+          "source_table": str | None,
+          "source_column": str | None,
+          "expression": str | None,   # SQL text for computed columns
+        }
+    A bare column (optionally aliased) has expression None.  An aliased
+    computed column (aggregation, cast, UDF call, ...) that references a
+    column reports the first such column as its source AND carries the SQL
+    text in "expression".  Items without any column reference have no source.
+    Returns [] when the SQL cannot be parsed or contains no SELECT.
+    """
+    sg = _sqlglot()
+    exp = sg.exp
+    try:
+        stmt = sg.parse_one(sql, dialect=dialect)
+    except Exception:
+        return []
+    # Unwrap CREATE TABLE AS SELECT / INSERT INTO SELECT
+    select = stmt.find(exp.Select)
+    if select is None:
+        if isinstance(stmt, exp.Select):
+            select = stmt
+        else:
+            return []
+    # Build alias map: alias → real table name (FROM first, then JOINs, so a
+    # JOIN entry wins when both use the same alias).
+    alias_map: dict[str, str] = {}
+    for clause_type in (exp.From, exp.Join):
+        for clause in select.find_all(clause_type):
+            tbl = clause.find(exp.Table)
+            if tbl:
+                alias_map[tbl.alias or tbl.name] = tbl.name
+    result = []
+    for sel in select.selects:
+        alias = sel.alias or (sel.name if hasattr(sel, "name") else None)
+        entry = {
+            "target_table": target_table,
+            "target_column": alias or str(sel),
+            "source_table": None,
+            "source_column": None,
+            "expression": str(sel),
+        }
+        if isinstance(sel, exp.Column):
+            col_node, is_plain = sel, True
+        elif isinstance(sel, exp.Alias):
+            col_node = sel.find(exp.Column)
+            # Only "col AS alias" is a straight copy; anything else wrapped by
+            # the alias is a computation whose SQL text must be kept.
+            is_plain = isinstance(sel.this, exp.Column)
+        else:
+            col_node, is_plain = None, False
+        if col_node:
+            tbl_alias = col_node.table if hasattr(col_node, "table") else None
+            entry["source_table"] = (
+                alias_map.get(tbl_alias, tbl_alias) if tbl_alias else None
+            )
+            entry["source_column"] = (
+                col_node.name if hasattr(col_node, "name") else None
+            )
+            if is_plain:
+                entry["expression"] = None
+        result.append(entry)
+    return result
+def parse_notebook_lineage(sql_cells: list[str], dialect: str = "spark") -> dict:
+    """
+    Combine the lineage of several notebook SQL cells.
+    Returns:
+        {
+          "table_edges": [{"source": str, "target": str}, ...],
+          "column_edges": [{...}, ...],
+          "statements": int,   # number of cells passed in, blank ones included
+          "parsed": int,       # cells that yielded a target and >= 1 source
+        }
+    Column edges are kept only when both source table and source column are
+    known.
+    """
+    edges_between_tables: list[dict] = []
+    edges_between_columns: list[dict] = []
+    recognised = 0
+    for raw in sql_cells:
+        text = raw.strip()
+        if not text:
+            continue
+        table_info = parse_table_lineage(text, dialect=dialect)
+        target, sources = table_info["target"], table_info["sources"]
+        if not (target and sources):
+            continue
+        recognised += 1
+        edges_between_tables.extend(
+            {"source": src, "target": target} for src in sources
+        )
+        for col in parse_column_lineage(text, target, dialect=dialect):
+            if not (col["source_table"] and col["source_column"]):
+                continue
+            edges_between_columns.append({
+                "source_table": col["source_table"],
+                "source_column": col["source_column"],
+                "target_table": col["target_table"],
+                "target_column": col["target_column"],
+                "transformation": col.get("expression"),
+            })
+    return {
+        "table_edges": edges_between_tables,
+        "column_edges": edges_between_columns,
+        "statements": len(sql_cells),
+        "parsed": recognised,
+    }

dashgov/scanner.py ADDED Viewed

@@ -0,0 +1,117 @@
+from __future__ import annotations
+from typing import Optional
+import re
+# Regular expressions applied with re.search to sampled string values; a single
+# matching value is enough for GovernanceScanner._detect_pii to report the type.
+PII_PATTERNS = {
+    "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
+    "phone": r"\+?\d[\d\s\-().]{7,}\d",
+    "credit_card": r"\b(?:\d[ -]?){13,16}\b",
+    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
+    "passport": r"\b[A-Z]{1,2}\d{6,9}\b",
+    "national_id": r"\b\d{3}-\d{3}-\d{4}-\d\b",
+}
+# Column-name keywords per sensitivity level.  Matching is by substring of the
+# lowercased column name, and levels are tried in the order written here, so
+# HIGH wins over MEDIUM, which wins over LOW.
+SENSITIVITY_KEYWORDS = {
+    "HIGH": ["salary", "income", "password", "secret", "credit_card", "ssn", "passport",
+             "national_id", "emirates_id", "iban", "account_number"],
+    "MEDIUM": ["email", "phone", "address", "dob", "birth", "gender", "nationality"],
+    "LOW": ["name", "city", "country", "region", "department"],
+}
+class GovernanceScanner:
+    """
+    Scan Databricks tables for PII and classify column sensitivity.
+    Pass either a DataFrame (*df*) or a table name (*table*); with only a
+    table name the table is read through the active SparkSession.
+    Usage::
+        scanner = GovernanceScanner(table="catalog.schema.customers")
+        report = scanner.scan()
+        report.display()
+        report.apply_tags()   # writes UC column tags
+    """
+    def __init__(self, df=None, table: Optional[str] = None):
+        """Raises ValueError when neither *df* nor *table* is given."""
+        if df is None and not table:
+            raise ValueError(
+                "GovernanceScanner needs a DataFrame (df) or a table name (table)"
+            )
+        self._table = table
+        self._df = self._load(df, table)
+    def _load(self, df, table):
+        """Return *df* when given, otherwise read *table* via the active session."""
+        if df is not None:
+            return df
+        # Imported lazily so a DataFrame can be scanned without importing pyspark here.
+        from pyspark.sql import SparkSession
+        spark = SparkSession.getActiveSession()
+        if spark is None:
+            raise RuntimeError(f"No active SparkSession — cannot load table {table}")
+        return spark.table(table)
+    def scan(self, sample_rows: int = 1000) -> "GovReport":
+        """
+        Inspect every column and return a GovReport.
+        Sensitivity is inferred from the column name.  PII types are detected
+        only for columns whose data type name contains "String", using up to
+        *sample_rows* rows of the DataFrame.
+        """
+        fields = self._df.schema.fields
+        string_cols = [f.name for f in fields if "String" in str(f.dataType)]
+        # One collect for all string columns instead of one Spark job per column.
+        rows = (
+            self._df.limit(sample_rows).select(*string_cols).collect()
+            if string_cols else []
+        )
+        findings = {}
+        for field in fields:
+            col_name = field.name
+            dtype = str(field.dataType)
+            pii_types = []
+            if "String" in dtype:
+                col_vals = [r[col_name] for r in rows if r[col_name] is not None]
+                pii_types = self._detect_pii(col_vals)
+            findings[col_name] = {
+                "dtype": dtype,
+                "sensitivity": self._infer_sensitivity(col_name),
+                "pii_types": pii_types,
+                "has_pii": len(pii_types) > 0,
+            }
+        return GovReport(self._table, findings)
+    def _infer_sensitivity(self, col_name: str) -> str:
+        """Return the first level whose keyword occurs in the lowercased name, else "NONE"."""
+        lower = col_name.lower()
+        for level, keywords in SENSITIVITY_KEYWORDS.items():
+            if any(kw in lower for kw in keywords):
+                return level
+        return "NONE"
+    def _detect_pii(self, values: list[str]) -> list[str]:
+        """
+        Return the PII types whose pattern matches at least one of the first
+        200 *values*, listed in PII_PATTERNS order.
+        """
+        sample = values[:200]
+        return [
+            pii_type
+            for pii_type, pattern in PII_PATTERNS.items()
+            if any(re.search(pattern, str(v)) for v in sample)
+        ]
+class GovReport:
+    """
+    Per-column findings of a governance scan.
+    *findings* maps column name → {"dtype", "sensitivity", "pii_types", "has_pii"};
+    *table* is the scanned table name, or None when a DataFrame was scanned.
+    """
+    def __init__(self, table: Optional[str], findings: dict):
+        self.table = table
+        self.findings = findings
+    def display(self):
+        """Print the findings as a fixed-width text table."""
+        print(f"Governance scan: {self.table or 'DataFrame'}")
+        print(f"{'Column':<30} {'Sensitivity':<12} {'PII Types'}")
+        print("-" * 65)
+        for col, info in self.findings.items():
+            pii = ", ".join(info["pii_types"]) or "—"
+            print(f"{col:<30} {info['sensitivity']:<12} {pii}")
+    def _tag_statement(self, col: str, sensitivity: str) -> str:
+        """Build the ALTER TABLE statement that tags *col* with *sensitivity*."""
+        # A backtick inside a backtick-quoted identifier must be doubled.
+        quoted = col.replace("`", "``")
+        # The table name is inserted as given: it is a caller-supplied,
+        # possibly multi-part identifier.
+        return (
+            f"ALTER TABLE {self.table} ALTER COLUMN `{quoted}` "
+            f"SET TAGS ('sensitivity' = '{sensitivity}')"
+        )
+    def apply_tags(self):
+        """
+        Write Unity Catalog column tags for sensitivity classification.
+        Columns whose sensitivity is "NONE" are left untouched.  A failure on
+        one column is printed and does not stop the others; the closing line
+        reports how many columns could not be tagged.
+        """
+        if not self.table:
+            print("⚠️  No table name — cannot apply UC tags")
+            return
+        from pyspark.sql import SparkSession
+        spark = SparkSession.getActiveSession()
+        if spark is None:
+            print("⚠️  No active SparkSession — cannot apply UC tags")
+            return
+        failed = 0
+        for col, info in self.findings.items():
+            if info["sensitivity"] == "NONE":
+                continue
+            try:
+                spark.sql(self._tag_statement(col, info["sensitivity"]))
+            except Exception as e:
+                failed += 1
+                print(f"  ⚠️  Could not tag {col}: {e}")
+        if failed:
+            print(f"⚠️  Tags applied to {self.table} with {failed} column(s) failing")
+        else:
+            print(f"✅ Tags applied to {self.table}")
+    def to_dict(self) -> dict:
+        """Return the findings mapping itself (not a copy)."""
+        return self.findings

dashgov/ui.py ADDED Viewed

@@ -0,0 +1,167 @@
+"""DashGov interactive UI for Databricks notebooks."""
+from __future__ import annotations
+def _lineage_html(graph_dict: dict, focus_table: str = "") -> str:
+    """Render a lineage graph as a simple HTML DAG (upstream → focus → downstream).
+
+    Args:
+        graph_dict: Mapping with optional keys ``"tables"`` (table name → info
+            dict whose ``"role"`` selects the label colour) and
+            ``"table_edges"`` (list of dicts with ``"source"`` and ``"target"``).
+        focus_table: Table drawn in the centre; only its direct neighbours are
+            shown. When empty, no focus box is rendered.
+
+    Returns:
+        An HTML fragment. Table names and roles are HTML-escaped because they
+        come from user input and catalog metadata rather than from this module.
+    """
+    from html import escape  # local import keeps the block self-contained
+    tables = graph_dict.get("tables", {})
+    edges = graph_dict.get("table_edges", [])
+    upstream = {e["source"] for e in edges if e["target"] == focus_table}
+    downstream = {e["target"] for e in edges if e["source"] == focus_table}
+    role_colors = {
+        "entity": "#2563eb",
+        "fact": "#16a34a",
+        "junction": "#7c3aed",
+        "aggregation": "#d97706",
+        "staging": "#6b7280",
+        "unknown": "#9ca3af",
+    }
+    def _box(name: str, pos: str) -> str:
+        # Show only the last dotted component to keep boxes compact.
+        short = name.split(".")[-1]
+        role = tables.get(name, {}).get("role", "unknown")
+        color = role_colors.get(role, "#9ca3af")
+        border = "3px solid #1d4ed8" if pos == "focus" else "1px solid #d1d5db"
+        bg = "#eff6ff" if pos == "focus" else "#f9fafb"
+        # str() keeps non-string roles renderable, as the f-string did before.
+        return (
+            f"<div style='padding:8px 12px;border:{border};border-radius:6px;"
+            f"background:{bg};color:#111;font-size:12px;margin:4px;display:inline-block'>"
+            f"<span style='color:{color};font-weight:600'>{escape(short)}</span>"
+            f"<br/><span style='font-size:10px;color:#6b7280'>{escape(str(role))}</span></div>"
+        )
+    # sorted() gives a deterministic box order for the set-based neighbours.
+    up_html = "".join(_box(t, "up") for t in sorted(upstream))
+    focus_html = _box(focus_table, "focus") if focus_table else ""
+    down_html = "".join(_box(t, "down") for t in sorted(downstream))
+    arrow = "<div style='font-size:20px;color:#9ca3af;margin:0 8px'>→</div>"
+    return (
+        "<div style='display:flex;align-items:center;flex-wrap:wrap;gap:4px;"
+        "font-family:monospace;padding:12px;background:#fff;border-radius:8px;"
+        "border:1px solid #e5e7eb'>"
+        f"<div style='display:flex;flex-direction:column'>{up_html}</div>"
+        f"{arrow if upstream else ''}"
+        f"{focus_html}"
+        f"{arrow if downstream else ''}"
+        f"<div style='display:flex;flex-direction:column'>{down_html}</div>"
+        "</div>"
+    )
+def launch():
+    """Build and display the DashGov widget UI in a notebook.
+
+    The UI has two steps: parsing table/column lineage from pasted SQL, and
+    fetching live lineage for a table from Unity Catalog.
+
+    Raises:
+        RuntimeError: If ``ipywidgets`` or ``IPython`` cannot be imported.
+    """
+    try:
+        import ipywidgets as w
+        from IPython.display import display
+    except ImportError as exc:
+        # Chain explicitly so the underlying import failure stays visible.
+        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets") from exc
+    import dashui
+    # ── SQL parser ────────────────────────────────────────────────────────────
+    sql_input = w.Textarea(
+        description="SQL:",
+        placeholder="Paste CREATE TABLE AS SELECT or INSERT INTO SELECT ...",
+        layout=w.Layout(width="100%", height="120px"),
+    )
+    dialect_toggle = w.ToggleButtons(
+        options=["spark", "snowflake", "bigquery", "trino"],
+        description="Dialect:",
+        value="spark",
+    )
+    parse_btn = dashui.action_button("Parse Lineage from SQL", style="info", emoji="🔍")
+    parse_output = dashui.output_panel()
+    def on_parse(b):
+        """Parse the pasted SQL and print its table and column lineage."""
+        with parse_output:
+            parse_output.clear_output()
+            sql = sql_input.value.strip()
+            if not sql:
+                print("⚠️  Paste a SQL statement above")
+                return
+            try:
+                from dashgov.parser import parse_table_lineage, parse_column_lineage
+                tl = parse_table_lineage(sql, dialect=dialect_toggle.value)
+                print(f"Type    : {tl['type']}")
+                print(f"Target  : {tl['target'] or '—'}")
+                print(f"Sources : {', '.join(tl['sources']) or '—'}")
+                # Column lineage is only attempted when a target table was found.
+                if tl["target"]:
+                    cl = parse_column_lineage(sql, tl["target"], dialect=dialect_toggle.value)
+                    if cl:
+                        print("\nColumn lineage:")
+                        for c in cl:
+                            # Fall back to the raw expression when the column
+                            # has no single source table.
+                            src = (
+                                f"{c['source_table']}.{c['source_column']}"
+                                if c["source_table"] else c.get("expression", "?")
+                            )
+                            print(f"  {src:40s} → {c['target_column']}")
+            except Exception as e:
+                # UI boundary: show the error in the panel instead of raising
+                # into the widget callback machinery.
+                print(f"❌ {e}")
+    parse_btn.on_click(on_parse)
+    # ── UC live lineage ───────────────────────────────────────────────────────
+    uc_workspace = w.Text(
+        description="Workspace URL:",
+        placeholder="https://adb-xxx.azuredatabricks.net",
+    )
+    uc_token = w.Password(description="Token:", placeholder="dapixxxxxxxx")
+    uc_table = w.Text(description="Table:", placeholder="catalog.schema.table")
+    uc_depth = w.IntSlider(description="Depth:", value=2, min=1, max=5)
+    uc_btn = dashui.action_button("Fetch UC Lineage", style="success", emoji="🌐")
+    uc_output = dashui.output_panel()
+    lineage_viz = w.HTML(value="")
+    def on_uc_fetch(b):
+        """Fetch lineage for the entered table and render summary plus graph."""
+        with uc_output:
+            uc_output.clear_output()
+            url = uc_workspace.value.strip()
+            tok = uc_token.value.strip()
+            tbl = uc_table.value.strip()
+            if not (url and tok and tbl):
+                print("⚠️  Fill in workspace URL, token, and table name")
+                return
+            # Drop any graph from an earlier fetch so a failure below does not
+            # leave a stale visualization next to the new error message.
+            lineage_viz.value = ""
+            try:
+                from dashgov.lineage import fetch_uc_lineage, build_lineage_graph
+                raw = fetch_uc_lineage(tbl, url, tok, depth=uc_depth.value)
+                graph = build_lineage_graph(
+                    raw["tables"], raw["table_edges"], raw["column_edges"]
+                )
+                s = graph.summary()
+                print(f"Tables : {s['total_tables']}")
+                print(f"Edges  : {s['total_table_edges']} table, {s['total_column_edges']} column")
+                print(f"Roots  : {', '.join(s['root_sources']) or '—'}")
+                print(f"Sinks  : {', '.join(s['leaf_sinks']) or '—'}")
+                lineage_viz.value = _lineage_html(graph.to_dict(), focus_table=tbl)
+                imp = graph.impact_analysis(tbl)
+                if imp["all_downstream"]:
+                    print(f"\nImpact if {tbl} changes:")
+                    for t in imp["all_downstream"]:
+                        print(f"  ↓ {t}")
+            except Exception as e:
+                # UI boundary: report the failure in the output panel.
+                print(f"❌ {e}")
+    uc_btn.on_click(on_uc_fetch)
+    ui = dashui.card([
+        dashui.header("DashGov — Data Lineage & Governance", library="dashgov", emoji="🔗"),
+        dashui.section("Step 1: Parse lineage from SQL"),
+        dashui.html(
+            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
+            "Paste a CREATE TABLE AS SELECT or INSERT INTO SELECT to extract "
+            "table and column lineage without a UC connection.</div>"
+        ),
+        sql_input, dialect_toggle, parse_btn, parse_output,
+        dashui.section("Step 2: Fetch live lineage from Unity Catalog"),
+        dashui.html(
+            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
+            "Requires a Databricks workspace URL and personal access token.</div>"
+        ),
+        w.HBox([uc_workspace, uc_token]),
+        w.HBox([uc_table, uc_depth]),
+        uc_btn, uc_output, lineage_viz,
+    ])
+    display(ui)