dash-gov 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: dash-gov
3
+ Version: 0.1.1
4
+ Summary: Data lineage and governance for Databricks — table/column lineage, classification, and a built-in notebook UI
5
+ Project-URL: Homepage, https://github.com/dash-libs/dash-gov
6
+ Author-email: Darshan Shah <darshan.innovation@gmail.com>
7
+ License: Apache-2.0
8
+ Keywords: data-catalog,databricks,governance,lineage,unity-catalog
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Information Technology
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.9
20
+ Requires-Dist: ipywidgets>=8.0
21
+ Requires-Dist: sqlglot>=23.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: hatch; extra == 'dev'
24
+ Requires-Dist: pdoc; extra == 'dev'
25
+ Requires-Dist: pytest; extra == 'dev'
26
+ Requires-Dist: pytest-cov; extra == 'dev'
27
+ Requires-Dist: ruff; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # DashGov — Databricks Library
31
+
32
+ [![CI](https://github.com/dash-libs/dash-gov/actions/workflows/ci.yml/badge.svg)](https://github.com/dash-libs/dash-gov/actions)
33
+ [![PyPI](https://img.shields.io/pypi/v/dash-gov)](https://pypi.org/project/dash-gov/)
34
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
35
+
36
+ Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ %pip install dash-gov
42
+ ```
43
+
44
+ ## Quick Start
45
+
46
+ ```python
47
+ import dashgov
48
+ dashgov.launch() # Opens interactive UI in your Databricks notebook
49
+ ```
50
+
51
+ ## Part of Dashlibs
52
+
53
+ | Library | Purpose |
54
+ |---|---|
55
+ | dash-dq | Data Quality |
56
+ | dash-synthetic | Synthetic Data Generation |
57
+ | dash-ml | ML Model Monitoring |
58
+ | dash-ingest | Data Ingestion |
59
+ | dash-gov | Data Governance |
60
+ | dash-ontology | Ontology & Lineage for AI |
61
+
62
+ ## License
63
+
64
+ Apache 2.0
@@ -0,0 +1,9 @@
1
+ dashgov/__init__.py,sha256=m2EPij-xHKLgcEec5WgqlCM__1CFnyeBF1CfqPoQzqk,574
2
+ dashgov/classifier.py,sha256=rwdS54_Tugw4pw2hPHh-4XZgF79e5avNKVBunT8w6FE,4797
3
+ dashgov/lineage.py,sha256=uE0UU-1FSC3qiQCED98k5H9pEXyXuf1ZMrq58nGsU9Y,11352
4
+ dashgov/parser.py,sha256=IZ1GtW-d4rF9xChqhIrXkt2FcRBjIsFIQoo-d4M0Ua0,6574
5
+ dashgov/scanner.py,sha256=5f-fEKDGqEP8rYa48JzEiAAlRHkdtjS4KMR_UfxyYQs,4070
6
+ dashgov/ui.py,sha256=h04kuwbzirGTKyX28eggLpMH-mojmqCSaz-Dy2Fn_gE,7349
7
+ dash_gov-0.1.1.dist-info/METADATA,sha256=YHEopPio4bD9UczTiWOBefEQspPDQwSim0rXbqA6fq0,2126
8
+ dash_gov-0.1.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ dash_gov-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
dashgov/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ """DashGov — Data lineage and governance for Databricks."""
2
+ from dashgov.lineage import LineageGraph, build_lineage_graph, fetch_uc_lineage
3
+ from dashgov.parser import parse_table_lineage, parse_column_lineage, parse_notebook_lineage
4
+ from dashgov.classifier import classify_table, classify_all
5
+ from dashgov.ui import launch
6
+
7
# Package version string.
__version__ = "0.1.1"
# Names re-exported from the submodules imported above; this is the list that
# `from dashgov import *` exposes.
__all__ = [
    "LineageGraph",
    "build_lineage_graph",
    "fetch_uc_lineage",
    "parse_table_lineage",
    "parse_column_lineage",
    "parse_notebook_lineage",
    "classify_table",
    "classify_all",
    "launch",
]
dashgov/classifier.py ADDED
@@ -0,0 +1,144 @@
1
+ """
2
+ Table role classification based on naming, schema shape, and lineage position.
3
+
4
+ Roles:
5
+ entity — root fact tables representing business objects (Customer, Order)
6
+ fact — transactional / event tables with FK refs to entities
7
+ junction — bridge tables expressing many:many relationships
8
+ aggregation — pre-computed summary / reporting tables
9
+ staging — intermediate / temp tables in a transformation pipeline
10
+ unknown — cannot be classified with confidence
11
+ """
12
+ from __future__ import annotations
13
+
14
+ # ── Name prefix/suffix patterns ───────────────────────────────────────────────
15
+
16
# Prefixes/suffixes are compared against the lowercased bare table name
# (the last dot-separated segment of the full name).
_STAGING_PREFIXES = {"stg_", "staging_", "tmp_", "temp_", "raw_", "src_", "landing_", "bronze_"}
_DIMENSION_PREFIXES = {"dim_", "d_"}
_FACT_PREFIXES = {"fact_", "fct_", "f_"}
_AGG_SUFFIXES = {
    "_agg", "_aggregated", "_summary", "_report",
    "_metrics", "_stats", "_kpi", "_rollup", "_daily",
    "_weekly", "_monthly", "_yearly",
}
_JUNCTION_SUFFIXES = {"_map", "_mapping", "_xref", "_bridge", "_link", "_rel", "_assoc", "_pivot"}

# Column names that strongly suggest a primary key (matched as whole names)
_PK_PATTERNS = {"id", "pk", "key", "uuid", "guid"}
# Column name endings that suggest a foreign key (a tuple, so it can be passed
# where an ordered sequence of suffixes is expected)
_FK_SUFFIXES = ("_id", "_pk", "_key", "_fk", "_ref", "_uuid")
30
+
31
+
32
+ def _name_lower(table_name: str) -> str:
33
+ """Extract bare table name (no catalog/schema) and lowercase it."""
34
+ return table_name.split(".")[-1].lower()
35
+
36
+
37
+ def _starts_with_any(name: str, prefixes: set[str]) -> bool:
38
+ return any(name.startswith(p) for p in prefixes)
39
+
40
+
41
+ def _ends_with_any(name: str, suffixes: set | tuple) -> bool:
42
+ return any(name.endswith(s) for s in suffixes)
43
+
44
+
45
def count_fk_columns(columns: list[dict]) -> int:
    """Return how many columns have a foreign-key-like name.

    A column is counted when its lowercased ``"name"`` is not exactly ``"id"``
    and ends with one of ``_FK_SUFFIXES``. Entries without a ``"name"`` key
    are treated as having an empty name.
    """
    total = 0
    for col in columns:
        lowered = col.get("name", "").lower()
        if lowered == "id":
            continue
        if _ends_with_any(lowered, _FK_SUFFIXES):
            total += 1
    return total
52
+
53
+
54
def has_primary_key(columns: list[dict]) -> bool:
    """Report whether any column name looks like a primary key.

    A name qualifies when, lowercased, it is one of ``_PK_PATTERNS``, is
    exactly ``"id"``, or ends with ``"_id"`` and is at most 10 characters long.
    """
    lowered = {col.get("name", "").lower() for col in columns}
    if lowered & _PK_PATTERNS:
        return True
    for candidate in lowered:
        if candidate == "id":
            return True
        # The length cap applies only to the "_id" suffix rule.
        if len(candidate) <= 10 and _ends_with_any(candidate, ("_id",)):
            return True
    return False
61
+
62
+
63
def classify_table(
    full_name: str,
    columns: list[dict],
    n_upstream: int = 0,
    n_downstream: int = 0,
) -> tuple[str, float]:
    """
    Classify a table's role.

    Rules are evaluated top to bottom and the first match wins, so name-based
    rules take priority over shape- and position-based ones.

    Args:
        full_name: Table name; only the last dot-separated segment is matched.
        columns: Column dicts; only the ``"name"`` key is inspected here.
        n_upstream: Number of tables feeding this one.
        n_downstream: Number of tables reading from this one.

    Returns (role: str, confidence: float).

    confidence values returned by the rules below:
      0.88–0.90 → name prefix / suffix match
      0.75–0.82 → fact prefix with upstream, junction shape, root with PK
      0.60–0.70 → position in lineage + shape
      0.40      → unknown
    """
    name = _name_lower(full_name)
    n_cols = len(columns)
    n_fk = count_fk_columns(columns)
    has_pk = has_primary_key(columns)

    # ── Staging ──
    if _starts_with_any(name, _STAGING_PREFIXES):
        return "staging", 0.90

    # ── Aggregation ──
    if _ends_with_any(name, _AGG_SUFFIXES):
        return "aggregation", 0.90
    # NOTE(review): a fact-prefixed table with upstream inputs is reported as
    # "aggregation", never "fact" — confirm this is intended.
    if _starts_with_any(name, _FACT_PREFIXES) and n_upstream > 0:
        return "aggregation", 0.75

    # ── Dimension / Entity ──
    if _starts_with_any(name, _DIMENSION_PREFIXES):
        return "entity", 0.90

    # ── Junction ──
    if _ends_with_any(name, _JUNCTION_SUFFIXES):
        return "junction", 0.88
    if n_cols >= 2 and n_fk >= 2 and n_fk / max(n_cols, 1) >= 0.6:
        # Mostly FK columns → junction/bridge table
        return "junction", 0.80

    # ── Entity ──
    # Root source with a PK and meaningful columns
    if n_upstream == 0 and has_pk and n_cols >= 3:
        return "entity", 0.82
    # Root source without a detected PK but with at least five columns
    if n_upstream == 0 and n_cols >= 5:
        return "entity", 0.65

    # ── Fact ──
    # Has upstream (transformed from somewhere) + FK columns
    if n_upstream >= 1 and n_fk >= 1 and n_downstream >= 1:
        return "fact", 0.70
    if n_upstream >= 1 and n_fk >= 2:
        return "fact", 0.65

    # ── Aggregation by position ──
    # Several inputs and no consumers
    if n_upstream >= 2 and n_downstream == 0:
        return "aggregation", 0.60

    return "unknown", 0.40
124
+
125
+
126
def classify_all(
    tables: dict,  # {full_name: {"columns": [...], "role": ...}}
    upstream_counts: dict[str, int],
    downstream_counts: dict[str, int],
) -> dict[str, tuple[str, float]]:
    """
    Run classify_table over every entry of *tables*.

    Missing ``"columns"`` default to an empty list and missing upstream /
    downstream counts default to 0.

    Returns {full_name: (role, confidence)}.
    """
    result = {}
    for full_name, info in tables.items():
        result[full_name] = classify_table(
            full_name,
            info.get("columns", []),
            upstream_counts.get(full_name, 0),
            downstream_counts.get(full_name, 0),
        )
    return result
dashgov/lineage.py ADDED
@@ -0,0 +1,312 @@
1
+ """
2
+ Lineage graph — table-level and column-level data lineage.
3
+
4
+ Works with plain Python dicts so it is fully testable without Spark or UC.
5
+ Use fetch_uc_lineage() to pull live data from a Unity Catalog workspace.
6
+ """
7
+ from __future__ import annotations
8
+ from dataclasses import dataclass
9
+ from collections import deque
10
+ from typing import Optional
11
+
12
+
13
@dataclass
class TableNode:
    """A table in the lineage graph."""

    full_name: str  # catalog.schema.table
    catalog: str  # catalog segment of full_name ("" when not present)
    schema_name: str  # schema segment of full_name ("" when not present)
    table: str  # bare table name (last segment of full_name)
    columns: list[dict]  # [{"name": str, "type": str, "nullable": bool}]
    role: str = "unknown"  # entity | fact | junction | aggregation | staging | unknown
21
+
22
+
23
@dataclass
class TableEdge:
    """A directed table-level dependency: *source* feeds *target*."""

    source: str  # full table name
    target: str  # full table name
27
+
28
+
29
@dataclass
class ColumnEdge:
    """A directed column-level dependency: the source column feeds the target column."""

    source_table: str
    source_column: str
    target_table: str
    target_column: str
    transformation: Optional[str] = None  # SQL expression when known
36
+
37
+
38
class LineageGraph:
    """Directed graph of table-level and column-level lineage.

    Nothing here enforces acyclicity; all traversals track visited nodes, so
    cyclic input terminates.

    Attributes:
        tables: Mapping of full table name to its TableNode.
        table_edges: Table-level edges (``source`` feeds ``target``).
        column_edges: Column-level edges.
    """

    def __init__(
        self,
        tables: "dict[str, TableNode]",
        table_edges: "list[TableEdge]",
        column_edges: "list[ColumnEdge]",
    ):
        self.tables = tables
        self.table_edges = table_edges
        self.column_edges = column_edges

        # Adjacency maps, built once: table → set of neighbouring tables.
        self._downstream: dict[str, set[str]] = {}
        self._upstream: dict[str, set[str]] = {}
        for e in table_edges:
            self._downstream.setdefault(e.source, set()).add(e.target)
            self._upstream.setdefault(e.target, set()).add(e.source)

    # ── Table-level traversal ────────────────────────────────────────────────

    def upstream_tables(self, table: str, depth: int = 1) -> list[str]:
        """All tables that feed into *table*, up to *depth* hops.

        Results are in breadth-first order; tables at the same distance are
        sorted by name.
        """
        return self._bfs(table, self._upstream, depth)

    def downstream_tables(self, table: str, depth: int = 1) -> list[str]:
        """All tables that consume from *table*, up to *depth* hops.

        Results are in breadth-first order; tables at the same distance are
        sorted by name.
        """
        return self._bfs(table, self._downstream, depth)

    def root_sources(self, table: str) -> list[str]:
        """Tables with no upstream that eventually feed into *table* (sorted).

        *table* itself is never reported, even when it has no upstream.
        """
        visited, result = set(), []
        stack = [table]
        while stack:
            t = stack.pop()
            if t in visited:
                continue
            visited.add(t)
            ups = self._upstream.get(t, ())
            if not ups and t != table:
                result.append(t)
            stack.extend(ups)
        return sorted(result)

    def impact_analysis(self, table: str) -> dict:
        """What breaks if *table* changes — full downstream tree.

        Returns a dict with the direct dependents (sorted), every transitive
        downstream table (breadth-first), and, per source column of *table*,
        the ``"target_table.target_column"`` strings it feeds directly.
        """
        direct = sorted(self._downstream.get(table, ()))
        # depth=None walks the whole reachable set instead of a magic hop cap.
        all_downstream = self._bfs(table, self._downstream, depth=None)
        col_targets: dict[str, list[str]] = {}
        for ce in self.column_edges:
            if ce.source_table == table:
                col_targets.setdefault(ce.source_column, []).append(
                    f"{ce.target_table}.{ce.target_column}"
                )
        return {
            "table": table,
            "direct_dependents": direct,
            "all_downstream": all_downstream,
            "affected_column_paths": col_targets,
            "total_affected_tables": len(all_downstream),
        }

    # ── Column-level traversal ───────────────────────────────────────────────

    def column_sources(self, table: str, column: str) -> "list[ColumnEdge]":
        """Edges that feed into *table.column*."""
        return [
            e for e in self.column_edges
            if e.target_table == table and e.target_column == column
        ]

    def column_targets(self, table: str, column: str) -> "list[ColumnEdge]":
        """Edges that *table.column* feeds into."""
        return [
            e for e in self.column_edges
            if e.source_table == table and e.source_column == column
        ]

    def column_lineage_chain(self, table: str, column: str) -> dict:
        """Full upstream chain for a single column.

        Each upstream column appears once in ``"upstream_columns"``, in
        first-discovery order, even when it is reachable via several paths.
        """
        # (table, column) tuples are used as keys: a joined "table.column"
        # string is ambiguous when either part itself contains a dot.
        expanded: set[tuple[str, str]] = set()
        listed: set[tuple[str, str]] = set()
        upstream = []
        stack = [(table, column)]
        while stack:
            key = stack.pop()
            if key in expanded:
                continue
            expanded.add(key)
            for src in self.column_sources(*key):
                src_key = (src.source_table, src.source_column)
                if src_key not in listed:
                    listed.add(src_key)
                    upstream.append(
                        {"table": src.source_table, "column": src.source_column}
                    )
                stack.append(src_key)
        return {
            "table": table,
            "column": column,
            "upstream_columns": upstream,
        }

    # ── Export ───────────────────────────────────────────────────────────────

    def to_dict(self) -> dict:
        """Return the graph as plain dicts/lists (tables, table_edges, column_edges)."""
        return {
            "tables": {
                k: {
                    "full_name": v.full_name,
                    "catalog": v.catalog,
                    "schema_name": v.schema_name,
                    "table": v.table,
                    "columns": v.columns,
                    "role": v.role,
                }
                for k, v in self.tables.items()
            },
            "table_edges": [
                {"source": e.source, "target": e.target} for e in self.table_edges
            ],
            "column_edges": [
                {
                    "source_table": e.source_table,
                    "source_column": e.source_column,
                    "target_table": e.target_table,
                    "target_column": e.target_column,
                    "transformation": e.transformation,
                }
                for e in self.column_edges
            ],
        }

    def summary(self) -> dict:
        """Return counts plus the registered tables with no upstream / no downstream."""
        return {
            "total_tables": len(self.tables),
            "total_table_edges": len(self.table_edges),
            "total_column_edges": len(self.column_edges),
            "root_sources": [t for t in self.tables if t not in self._upstream],
            "leaf_sinks": [t for t in self.tables if t not in self._downstream],
        }

    # ── Internal ─────────────────────────────────────────────────────────────

    def _bfs(self, start: str, adj: dict, depth: "Optional[int]") -> list[str]:
        """Breadth-first walk from *start* over *adj*, excluding *start*.

        *depth* limits the number of hops; ``None`` means unbounded.
        """
        visited, result = {start}, []
        queue = deque([(start, 0)])
        while queue:
            node, d = queue.popleft()
            if depth is not None and d >= depth:
                continue
            # Adjacency values are sets; sort so output does not depend on
            # string hash randomisation.
            for neighbour in sorted(adj.get(node, ())):
                if neighbour not in visited:
                    visited.add(neighbour)
                    result.append(neighbour)
                    queue.append((neighbour, d + 1))
        return result
190
+
191
+
192
+ # ── Constructors ─────────────────────────────────────────────────────────────
193
+
194
def build_lineage_graph(
    tables: list[dict],
    table_edges: list[dict],
    column_edges: list[dict],
) -> LineageGraph:
    """
    Assemble a LineageGraph from plain dict inputs.

    tables — [{"full_name": str, "columns": [{name, type, nullable}], ...}]
    table_edges — [{"source": str, "target": str}]
    column_edges — [{"source_table", "source_column", "target_table", "target_column"}]

    ``full_name`` is split on dots: with three or more segments the first two
    are catalog and schema; with two segments the first is the schema; the
    last segment is always the table name. An optional ``"transformation"``
    key on a column edge is carried over.
    """
    nodes: dict[str, TableNode] = {}
    for spec in tables:
        full = spec["full_name"]
        segments = full.split(".")
        if len(segments) >= 3:
            cat, sch = segments[0], segments[1]
        elif len(segments) == 2:
            cat, sch = "", segments[0]
        else:
            cat, sch = "", ""
        nodes[full] = TableNode(
            full_name=full,
            catalog=cat,
            schema_name=sch,
            table=segments[-1],
            columns=spec.get("columns", []),
            role=spec.get("role", "unknown"),
        )

    t_edges = []
    for edge in table_edges:
        t_edges.append(TableEdge(source=edge["source"], target=edge["target"]))

    c_edges = []
    for edge in column_edges:
        c_edges.append(
            ColumnEdge(
                source_table=edge["source_table"],
                source_column=edge["source_column"],
                target_table=edge["target_table"],
                target_column=edge["target_column"],
                transformation=edge.get("transformation"),
            )
        )
    return LineageGraph(nodes, t_edges, c_edges)
234
+
235
+
236
def fetch_uc_lineage(
    table: str,
    workspace_url: str,
    token: str,
    depth: int = 2,
) -> dict:
    """
    Fetch table-level and column-level lineage from Unity Catalog REST API.

    Starting at *table*, neighbours are expanded level by level for *depth*
    rounds. Column lineage is requested for the root table only.

    Args:
        table: Full name of the table to start from.
        workspace_url: Workspace base URL (https://...); a trailing slash is
            stripped.
        token: Databricks PAT, sent as a Bearer token.
        depth: Number of expansion rounds.

    Returns:
        A dict compatible with build_lineage_graph():
        ``{"tables": [...], "table_edges": [...], "column_edges": [...]}``.
        Tables are listed in discovery order with empty column lists.

    Raises:
        RuntimeError: if ``requests`` is not installed.

    Non-200 responses do not abort the walk; they emit a RuntimeWarning and
    the affected table is simply not expanded.
    """
    try:
        import requests
    except ImportError as err:
        raise RuntimeError("requests is required: pip install requests") from err
    import warnings

    headers = {"Authorization": f"Bearer {token}"}
    base = workspace_url.rstrip("/")

    # NOTE(review): the endpoint paths and response keys used below
    # ("table-lineages", "upstream_tables", "column_lineage", ...) should be
    # confirmed against the Databricks Data Lineage REST API reference.

    visited_tables: set[str] = {table}
    discovered: list[str] = [table]  # same members as visited_tables, ordered
    seen_edges: set[tuple[str, str]] = set()
    table_edges: list[dict] = []
    column_edges: list[dict] = []

    def _record_edge(source: str, target: str) -> None:
        """Append the edge once, however many tables report it."""
        if (source, target) not in seen_edges:
            seen_edges.add((source, target))
            table_edges.append({"source": source, "target": target})

    def _visit(name: str, frontier: deque) -> None:
        """Queue *name* for expansion the first time it is seen."""
        if name not in visited_tables:
            visited_tables.add(name)
            discovered.append(name)
            frontier.append(name)

    queue = deque([table])
    for _ in range(depth):
        next_queue: deque = deque()
        while queue:
            t = queue.popleft()
            resp = requests.get(
                f"{base}/api/2.0/lineage-tracking/table-lineages",
                headers=headers,
                params={"table_name": t},
                timeout=15,
            )
            if resp.status_code != 200:
                warnings.warn(
                    f"table lineage request for {t!r} returned HTTP "
                    f"{resp.status_code}; skipping",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            data = resp.json()
            # Edges are recorded even when the neighbour was already visited;
            # otherwise edges between two known tables would be lost.
            for up in data.get("upstream_tables", []):
                src = up.get("name", "")
                if src:
                    _record_edge(src, t)
                    _visit(src, next_queue)
            for down in data.get("downstream_tables", []):
                tgt = down.get("name", "")
                if tgt:
                    _record_edge(t, tgt)
                    _visit(tgt, next_queue)
        queue = next_queue

    # Column lineage for the root table
    col_resp = requests.get(
        f"{base}/api/2.0/lineage-tracking/column-lineages",
        headers=headers,
        params={"table_name": table},
        timeout=15,
    )
    if col_resp.status_code == 200:
        for col_data in col_resp.json().get("column_lineage", []):
            tgt_col = col_data.get("name", "")
            for up in col_data.get("upstream_columns", []):
                src_table = up.get("table_name", "")
                src_col = up.get("name", "")
                # Skip edges with a missing endpoint, as the table-level
                # loop above does for empty names.
                if not (src_table and src_col and tgt_col):
                    continue
                column_edges.append({
                    "source_table": src_table,
                    "source_column": src_col,
                    "target_table": table,
                    "target_column": tgt_col,
                })
    else:
        warnings.warn(
            f"column lineage request for {table!r} returned HTTP "
            f"{col_resp.status_code}; no column edges collected",
            RuntimeWarning,
            stacklevel=2,
        )

    tables_list = [{"full_name": t, "columns": []} for t in discovered]
    return {
        "tables": tables_list,
        "table_edges": table_edges,
        "column_edges": column_edges,
    }
dashgov/parser.py ADDED
@@ -0,0 +1,201 @@
1
+ """
2
+ SQL-based lineage extraction.
3
+
4
+ Parses CREATE TABLE AS SELECT, INSERT INTO SELECT, and plain SELECT
5
+ statements to extract table-level and column-level lineage without
6
+ requiring a live Unity Catalog connection.
7
+
8
+ Requires sqlglot (pure Python, no Spark dependency).
9
+ """
10
+ from __future__ import annotations
11
+
12
+
13
+ def _sqlglot():
14
+ try:
15
+ import sqlglot
16
+ return sqlglot
17
+ except ImportError:
18
+ raise RuntimeError("sqlglot is required: pip install sqlglot")
19
+
20
+
21
def parse_table_lineage(sql: str, dialect: str = "spark") -> dict:
    """
    Extract table-level lineage from a SQL statement.

    Returns:
    {
        "target": str | None,   # the table being written to (bare name)
        "sources": [str, ...],  # tables being read from, de-duplicated
        "type": "ctas"|"insert"|"select"|"unknown"
    }

    Source names keep whatever catalog/schema qualification the SQL used.
    Names that refer to a CTE defined in the statement are not reported as
    sources. SQL that fails to parse yields type "unknown".

    NOTE(review): "target" is the bare table name while "sources" are
    qualified; confirm whether callers rely on this before aligning the two.
    Every CREATE statement is labelled "ctas", including ones without a query.
    """
    sg = _sqlglot()
    exp = sg.exp

    try:
        stmt = sg.parse_one(sql, dialect=dialect)
    except Exception:
        # Deliberate best-effort: any parse failure means "no lineage".
        return {"target": None, "sources": [], "type": "unknown"}

    # CTE aliases appear as exp.Table nodes where they are referenced, but
    # they are not physical tables.
    cte_names = {
        cte.alias.lower() for cte in stmt.find_all(exp.CTE) if cte.alias
    }

    def _full(tbl) -> str:
        return ".".join(p for p in (tbl.catalog, tbl.db, tbl.name) if p)

    def _is_cte_ref(tbl) -> bool:
        # Only an unqualified name can refer to a CTE.
        return not tbl.db and not tbl.catalog and tbl.name.lower() in cte_names

    def _table_names(node) -> list[str]:
        return [
            _full(t)
            for t in node.find_all(exp.Table)
            if t.name and not _is_cte_ref(t)
        ]

    if isinstance(stmt, (exp.Create, exp.Insert)):
        tbl = stmt.find(exp.Table)
        target_full = _full(tbl) if tbl is not None else None
        target_short = tbl.name if tbl is not None else None
        # Scan the whole statement so UNION branches and WITH clauses are
        # included; the write target itself is excluded.
        sources = [n for n in _table_names(stmt) if n != target_full]
        kind = "ctas" if isinstance(stmt, exp.Create) else "insert"
        return {
            "target": target_short,
            "sources": list(dict.fromkeys(sources)),
            "type": kind,
        }

    if isinstance(stmt, (exp.Select, exp.Subquery, exp.Union)):
        sources = list(dict.fromkeys(_table_names(stmt)))
        return {"target": None, "sources": sources, "type": "select"}

    return {"target": None, "sources": [], "type": "unknown"}
71
+
72
+
73
def parse_column_lineage(
    sql: str,
    target_table: str,
    dialect: str = "spark",
) -> list[dict]:
    """
    Extract column-level lineage from a SQL statement.

    Only the first SELECT found in the statement is analysed (this unwraps
    CREATE TABLE AS SELECT / INSERT INTO SELECT).

    Returns list of:
    {
        "target_table": str,            # *target_table* as passed in
        "target_column": str,
        "source_table": str | None,
        "source_column": str | None,
        "expression": str | None,       # for computed columns
    }

    A plain column reference (optionally aliased) has ``expression`` None.
    Any other projection (aggregations, UDFs, arithmetic, literals) has
    ``expression`` set to its SQL text; if it is aliased and references a
    column, the first such column is reported as the source. Unqualified
    columns have ``source_table`` None. Table aliases are resolved to the
    bare table name. SQL that fails to parse yields an empty list.
    """
    sg = _sqlglot()
    exp = sg.exp

    try:
        stmt = sg.parse_one(sql, dialect=dialect)
    except Exception:
        # Deliberate best-effort: any parse failure means "no lineage".
        return []

    # find() also considers the statement itself, so a bare SELECT is covered.
    select = stmt.find(exp.Select)
    if select is None:
        return []

    # Build alias map: alias → real table name (FROM clauses first, then JOINs)
    alias_map: dict[str, str] = {}
    for holder in (*select.find_all(exp.From), *select.find_all(exp.Join)):
        tbl = holder.find(exp.Table)
        if tbl is not None:
            alias_map[tbl.alias or tbl.name] = tbl.name

    result = []
    for sel in select.selects:
        target_col = sel.alias or sel.name or str(sel)

        if isinstance(sel, exp.Column):
            col_node, is_direct = sel, True
        elif isinstance(sel, exp.Alias):
            col_node = sel.find(exp.Column)
            # Direct only when the alias wraps the column itself, not an
            # expression that merely contains one.
            is_direct = isinstance(sel.this, exp.Column)
        else:
            col_node, is_direct = None, False

        src_tbl = src_col = None
        if col_node is not None:
            qualifier = col_node.table
            src_tbl = alias_map.get(qualifier, qualifier) if qualifier else None
            src_col = col_node.name

        result.append({
            "target_table": target_table,
            "target_column": target_col,
            "source_table": src_tbl,
            "source_column": src_col,
            "expression": None if is_direct else str(sel),
        })

    return result
158
+
159
+
160
def parse_notebook_lineage(sql_cells: list[str], dialect: str = "spark") -> dict:
    """
    Combine the lineage of several notebook SQL cells.

    Blank cells are skipped. A cell contributes only when it has both a
    write target and at least one source table; column edges are kept only
    when both the source table and source column are known.

    Returns:
    {
        "table_edges": [{"source": str, "target": str}, ...],
        "column_edges": [{...}, ...],
        "statements": int,   # len(sql_cells), blank cells included
        "parsed": int,       # cells that contributed lineage
    }
    """
    table_edges: list[dict] = []
    column_edges: list[dict] = []
    parsed = 0

    for raw in sql_cells:
        cell = raw.strip()
        if not cell:
            continue
        tl = parse_table_lineage(cell, dialect=dialect)
        if not (tl["target"] and tl["sources"]):
            continue
        parsed += 1
        table_edges.extend(
            {"source": src, "target": tl["target"]} for src in tl["sources"]
        )
        for ce in parse_column_lineage(cell, tl["target"], dialect=dialect):
            if not (ce["source_table"] and ce["source_column"]):
                continue
            column_edges.append({
                "source_table": ce["source_table"],
                "source_column": ce["source_column"],
                "target_table": ce["target_table"],
                "target_column": ce["target_column"],
                "transformation": ce.get("expression"),
            })

    return {
        "table_edges": table_edges,
        "column_edges": column_edges,
        "statements": len(sql_cells),
        "parsed": parsed,
    }
dashgov/scanner.py ADDED
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ import re
4
+
5
+
6
# Regular expressions keyed by PII type. They are applied with re.search to
# the string form of sampled values, so a match anywhere in a value counts.
PII_PATTERNS = {
    "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
    "phone": r"\+?\d[\d\s\-().]{7,}\d",
    "credit_card": r"\b(?:\d[ -]?){13,16}\b",
    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
    "passport": r"\b[A-Z]{1,2}\d{6,9}\b",
    "national_id": r"\b\d{3}-\d{3}-\d{4}-\d\b",
}

# Column-name keywords per sensitivity level, matched as substrings of the
# lowercased column name. Levels are checked in the order written here and
# the first level with a matching keyword wins, so HIGH must stay first.
SENSITIVITY_KEYWORDS = {
    "HIGH": ["salary", "income", "password", "secret", "credit_card", "ssn", "passport",
             "national_id", "emirates_id", "iban", "account_number"],
    "MEDIUM": ["email", "phone", "address", "dob", "birth", "gender", "nationality"],
    "LOW": ["name", "city", "country", "region", "department"],
}
21
+
22
+
23
class GovernanceScanner:
    """
    Scan Databricks tables for PII, classify sensitivity, and apply tags.

    Either an existing DataFrame (*df*) or a table name (*table*) must be
    given. With only a table name, the table is read through the active
    SparkSession.

    Usage::
        scanner = GovernanceScanner(table="catalog.schema.customers")
        report = scanner.scan()
        report.display()
        report.apply_tags()  # writes UC column tags
    """

    def __init__(self, df=None, table: Optional[str] = None):
        self._table = table
        self._df = self._load(df, table)

    def _load(self, df, table):
        """Return *df* if given, otherwise load *table* via the active SparkSession.

        Raises:
            ValueError: if neither *df* nor *table* is provided.
            RuntimeError: if a table must be loaded but no SparkSession is active.
        """
        if df is not None:
            return df
        if not table:
            raise ValueError(
                "GovernanceScanner requires either a DataFrame (df) or a table name"
            )
        from pyspark.sql import SparkSession
        spark = SparkSession.getActiveSession()
        if spark is None:
            raise RuntimeError(f"No active SparkSession — cannot load table {table}")
        return spark.table(table)

    def scan(self, sample_rows: int = 1000) -> "GovReport":
        """Inspect every column and return a GovReport.

        Sensitivity is inferred from the column name. PII detection runs only
        on columns whose data type string contains "String", using non-null
        values from the first *sample_rows* rows.
        """
        fields = self._df.schema.fields
        dtypes = {f.name: str(f.dataType) for f in fields}
        string_cols = [name for name, dtype in dtypes.items() if "String" in dtype]

        # Collect the sample once for all string columns instead of running
        # one Spark job per column.
        rows = (
            self._df.limit(sample_rows).select(*string_cols).collect()
            if string_cols else []
        )

        findings = {}
        for field in fields:
            col_name = field.name
            pii_types = []
            if col_name in string_cols:
                col_vals = [r[col_name] for r in rows if r[col_name] is not None]
                pii_types = self._detect_pii(col_vals)

            findings[col_name] = {
                "dtype": dtypes[col_name],
                "sensitivity": self._infer_sensitivity(col_name),
                "pii_types": pii_types,
                "has_pii": len(pii_types) > 0,
            }

        return GovReport(self._table, findings)

    def _infer_sensitivity(self, col_name: str) -> str:
        """Return the first sensitivity level with a keyword contained in *col_name*, else "NONE"."""
        lower = col_name.lower()
        for level, keywords in SENSITIVITY_KEYWORDS.items():
            if any(kw in lower for kw in keywords):
                return level
        return "NONE"

    def _detect_pii(self, values: list[str]) -> list[str]:
        """Return the PII types whose pattern matches any of the first 200 values.

        Types are returned in PII_PATTERNS order so the result is
        deterministic.
        """
        sample = [str(v) for v in values[:200]]
        return [
            pii_type
            for pii_type, pattern in PII_PATTERNS.items()
            if any(re.search(pattern, v) for v in sample)
        ]
83
+
84
+
85
class GovReport:
    """Result of a governance scan: per-column dtype, sensitivity and PII findings.

    Attributes:
        table: Name of the scanned table, or None when a DataFrame was scanned.
        findings: ``{column: {"dtype", "sensitivity", "pii_types", "has_pii"}}``.
    """

    def __init__(self, table: Optional[str], findings: dict):
        self.table = table
        self.findings = findings

    def display(self):
        """Print the findings as a fixed-width text table."""
        print(f"Governance scan: {self.table or 'DataFrame'}")
        print(f"{'Column':<30} {'Sensitivity':<12} {'PII Types'}")
        print("-" * 65)
        for col, info in self.findings.items():
            pii = ", ".join(info["pii_types"]) or "—"
            print(f"{col:<30} {info['sensitivity']:<12} {pii}")

    def apply_tags(self):
        """Write Unity Catalog column tags for sensitivity classification.

        Columns with sensitivity "NONE" are skipped. A failure on one column
        is reported and does not stop the remaining columns. Nothing is done
        when the report has no table name or no SparkSession is active.
        """
        if not self.table:
            print("⚠️ No table name — cannot apply UC tags")
            return
        from pyspark.sql import SparkSession
        spark = SparkSession.getActiveSession()
        if spark is None:
            print("⚠️ No active SparkSession — cannot apply UC tags")
            return

        tagged = failed = 0
        for col, info in self.findings.items():
            if info["sensitivity"] == "NONE":
                continue
            # A backtick inside a backtick-quoted identifier is escaped by
            # doubling it.
            quoted_col = col.replace("`", "``")
            try:
                spark.sql(
                    f"ALTER TABLE {self.table} ALTER COLUMN `{quoted_col}` "
                    f"SET TAGS ('sensitivity' = '{info['sensitivity']}')"
                )
            except Exception as e:
                failed += 1
                print(f" ⚠️ Could not tag {col}: {e}")
            else:
                tagged += 1

        # Only claim success when no column failed.
        if failed:
            print(f"⚠️ Tagged {tagged} column(s) on {self.table}; {failed} failed")
        else:
            print(f"✅ Tags applied to {self.table}")

    def to_dict(self) -> dict:
        """Return the findings dict (the same object held by the report)."""
        return self.findings
dashgov/ui.py ADDED
@@ -0,0 +1,167 @@
1
+ """DashGov interactive UI for Databricks notebooks."""
2
+ from __future__ import annotations
3
+
4
+
5
+ def _lineage_html(graph_dict: dict, focus_table: str = "") -> str:
6
+ """Render a lineage graph as a simple HTML DAG (upstream → focus → downstream)."""
7
+ tables = graph_dict.get("tables", {})
8
+ edges = graph_dict.get("table_edges", [])
9
+
10
+ upstream = {e["source"] for e in edges if e["target"] == focus_table}
11
+ downstream = {e["target"] for e in edges if e["source"] == focus_table}
12
+
13
+ role_colors = {
14
+ "entity": "#2563eb",
15
+ "fact": "#16a34a",
16
+ "junction": "#7c3aed",
17
+ "aggregation": "#d97706",
18
+ "staging": "#6b7280",
19
+ "unknown": "#9ca3af",
20
+ }
21
+
22
+ def _box(name: str, pos: str) -> str:
23
+ short = name.split(".")[-1]
24
+ role = tables.get(name, {}).get("role", "unknown")
25
+ color = role_colors.get(role, "#9ca3af")
26
+ border = "3px solid #1d4ed8" if pos == "focus" else "1px solid #d1d5db"
27
+ bg = "#eff6ff" if pos == "focus" else "#f9fafb"
28
+ return (
29
+ f"<div style='padding:8px 12px;border:{border};border-radius:6px;"
30
+ f"background:{bg};color:#111;font-size:12px;margin:4px;display:inline-block'>"
31
+ f"<span style='color:{color};font-weight:600'>{short}</span>"
32
+ f"<br/><span style='font-size:10px;color:#6b7280'>{role}</span></div>"
33
+ )
34
+
35
+ up_html = "".join(_box(t, "up") for t in sorted(upstream))
36
+ focus_html = _box(focus_table, "focus") if focus_table else ""
37
+ down_html = "".join(_box(t, "down") for t in sorted(downstream))
38
+ arrow = "<div style='font-size:20px;color:#9ca3af;margin:0 8px'>→</div>"
39
+
40
+ return (
41
+ "<div style='display:flex;align-items:center;flex-wrap:wrap;gap:4px;"
42
+ "font-family:monospace;padding:12px;background:#fff;border-radius:8px;"
43
+ "border:1px solid #e5e7eb'>"
44
+ f"<div style='display:flex;flex-direction:column'>{up_html}</div>"
45
+ f"{arrow if upstream else ''}"
46
+ f"{focus_html}"
47
+ f"{arrow if downstream else ''}"
48
+ f"<div style='display:flex;flex-direction:column'>{down_html}</div>"
49
+ "</div>"
50
+ )
51
+
52
+
53
def launch():
    """Build and display the DashGov notebook UI.

    The UI has two sections: one parses lineage from pasted SQL, the other
    fetches live lineage from Unity Catalog and renders it. Results are
    printed into output panels; errors raised by the handlers are caught and
    printed rather than propagated.

    Raises:
        RuntimeError: if ipywidgets / IPython cannot be imported.
    """
    try:
        import ipywidgets as w
        from IPython.display import display
    except ImportError:
        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets")

    # NOTE(review): dashui is imported outside the try block, so a missing
    # install surfaces as a plain ImportError. It is not listed among the
    # Requires-Dist entries of this wheel's METADATA — confirm how it is
    # expected to be installed.
    import dashui

    # ── SQL parser ────────────────────────────────────────────────────────────
    sql_input = w.Textarea(
        description="SQL:",
        placeholder="Paste CREATE TABLE AS SELECT or INSERT INTO SELECT ...",
        layout=w.Layout(width="100%", height="120px"),
    )
    dialect_toggle = w.ToggleButtons(
        options=["spark", "snowflake", "bigquery", "trino"],
        description="Dialect:",
        value="spark",
    )
    parse_btn = dashui.action_button("Parse Lineage from SQL", style="info", emoji="🔍")
    parse_output = dashui.output_panel()

    def on_parse(b):
        # Click handler: parse the pasted SQL and print table/column lineage.
        with parse_output:
            parse_output.clear_output()
            sql = sql_input.value.strip()
            if not sql:
                print("⚠️ Paste a SQL statement above")
                return
            try:
                from dashgov.parser import parse_table_lineage, parse_column_lineage
                tl = parse_table_lineage(sql, dialect=dialect_toggle.value)
                print(f"Type : {tl['type']}")
                print(f"Target : {tl['target'] or '—'}")
                print(f"Sources : {', '.join(tl['sources']) or '—'}")
                # Column lineage is only attempted when a write target exists.
                if tl["target"]:
                    cl = parse_column_lineage(sql, tl["target"], dialect=dialect_toggle.value)
                    if cl:
                        print("\nColumn lineage:")
                        for c in cl:
                            # Show table.column when the source table is
                            # known, otherwise the expression text.
                            src = (
                                f"{c['source_table']}.{c['source_column']}"
                                if c["source_table"] else c.get("expression", "?")
                            )
                            print(f" {src:40s} → {c['target_column']}")
            except Exception as e:
                print(f"❌ {e}")

    parse_btn.on_click(on_parse)

    # ── UC live lineage ───────────────────────────────────────────────────────
    uc_workspace = w.Text(
        description="Workspace URL:",
        placeholder="https://adb-xxx.azuredatabricks.net",
    )
    uc_token = w.Password(description="Token:", placeholder="dapixxxxxxxx")
    uc_table = w.Text(description="Table:", placeholder="catalog.schema.table")
    uc_depth = w.IntSlider(description="Depth:", value=2, min=1, max=5)
    uc_btn = dashui.action_button("Fetch UC Lineage", style="success", emoji="🌐")
    uc_output = dashui.output_panel()
    lineage_viz = w.HTML(value="")

    def on_uc_fetch(b):
        # Click handler: fetch lineage over REST, print a summary, draw the
        # focus table's neighbours, and list everything downstream of it.
        with uc_output:
            uc_output.clear_output()
            url = uc_workspace.value.strip()
            tok = uc_token.value.strip()
            tbl = uc_table.value.strip()
            if not (url and tok and tbl):
                print("⚠️ Fill in workspace URL, token, and table name")
                return
            try:
                from dashgov.lineage import fetch_uc_lineage, build_lineage_graph
                raw = fetch_uc_lineage(tbl, url, tok, depth=uc_depth.value)
                graph = build_lineage_graph(
                    raw["tables"], raw["table_edges"], raw["column_edges"]
                )
                s = graph.summary()
                print(f"Tables : {s['total_tables']}")
                print(f"Edges : {s['total_table_edges']} table, {s['total_column_edges']} column")
                print(f"Roots : {', '.join(s['root_sources']) or '—'}")
                print(f"Sinks : {', '.join(s['leaf_sinks']) or '—'}")
                lineage_viz.value = _lineage_html(graph.to_dict(), focus_table=tbl)
                imp = graph.impact_analysis(tbl)
                if imp["all_downstream"]:
                    print(f"\nImpact if {tbl} changes:")
                    for t in imp["all_downstream"]:
                        print(f" ↓ {t}")
            except Exception as e:
                print(f"❌ {e}")

    uc_btn.on_click(on_uc_fetch)

    # Assemble both sections into a single card and render it.
    ui = dashui.card([
        dashui.header("DashGov — Data Lineage & Governance", library="dashgov", emoji="🔗"),

        dashui.section("Step 1: Parse lineage from SQL"),
        dashui.html(
            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
            "Paste a CREATE TABLE AS SELECT or INSERT INTO SELECT to extract "
            "table and column lineage without a UC connection.</div>"
        ),
        sql_input, dialect_toggle, parse_btn, parse_output,

        dashui.section("Step 2: Fetch live lineage from Unity Catalog"),
        dashui.html(
            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
            "Requires a Databricks workspace URL and personal access token.</div>"
        ),
        w.HBox([uc_workspace, uc_token]),
        w.HBox([uc_table, uc_depth]),
        uc_btn, uc_output, lineage_viz,
    ])
    display(ui)