dash-gov 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dash_gov-0.1.1/.github/workflows/ci.yml +53 -0
- dash_gov-0.1.1/.github/workflows/daily.yml +33 -0
- dash_gov-0.1.1/.github/workflows/release.yml +267 -0
- dash_gov-0.1.1/.gitignore +9 -0
- dash_gov-0.1.1/CLAUDE.md +21 -0
- dash_gov-0.1.1/PKG-INFO +64 -0
- dash_gov-0.1.1/README.md +35 -0
- dash_gov-0.1.1/dashgov/__init__.py +18 -0
- dash_gov-0.1.1/dashgov/classifier.py +144 -0
- dash_gov-0.1.1/dashgov/lineage.py +312 -0
- dash_gov-0.1.1/dashgov/parser.py +201 -0
- dash_gov-0.1.1/dashgov/scanner.py +117 -0
- dash_gov-0.1.1/dashgov/ui.py +167 -0
- dash_gov-0.1.1/docs/api/dashgov.html +1340 -0
- dash_gov-0.1.1/docs/api/index.html +7 -0
- dash_gov-0.1.1/docs/api/search.js +46 -0
- dash_gov-0.1.1/marketplace/listing.json +13 -0
- dash_gov-0.1.1/marketplace/quickstart_notebook.py +29 -0
- dash_gov-0.1.1/pyproject.toml +41 -0
- dash_gov-0.1.1/tests/test_classifier.py +166 -0
- dash_gov-0.1.1/tests/test_dashgov.py +22 -0
- dash_gov-0.1.1/tests/test_lineage.py +189 -0
- dash_gov-0.1.1/tests/test_parser.py +133 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.11"
|
|
17
|
+
- run: pip install ruff
|
|
18
|
+
- run: ruff check dashgov/
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
needs: lint
|
|
23
|
+
strategy:
|
|
24
|
+
matrix:
|
|
25
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: actions/setup-python@v5
|
|
29
|
+
with:
|
|
30
|
+
python-version: ${{ matrix.python-version }}
|
|
31
|
+
- name: Install
|
|
32
|
+
run: pip install -e ".[dev]" pytest pytest-cov
|
|
33
|
+
- name: Test
|
|
34
|
+
run: pytest tests/ -v --cov=dashgov --cov-report=xml
|
|
35
|
+
- name: Upload coverage
|
|
36
|
+
uses: codecov/codecov-action@v4
|
|
37
|
+
with:
|
|
38
|
+
files: coverage.xml
|
|
39
|
+
|
|
40
|
+
build:
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
needs: test
|
|
43
|
+
steps:
|
|
44
|
+
- uses: actions/checkout@v4
|
|
45
|
+
- uses: actions/setup-python@v5
|
|
46
|
+
with:
|
|
47
|
+
python-version: "3.11"
|
|
48
|
+
- run: pip install hatch
|
|
49
|
+
- run: hatch build
|
|
50
|
+
- uses: actions/upload-artifact@v4
|
|
51
|
+
with:
|
|
52
|
+
name: dist
|
|
53
|
+
path: dist/
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Daily Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
schedule:
|
|
5
|
+
- cron: "0 6 * * *" # Every day 06:00 UTC — tests only, no commit
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
|
|
23
|
+
- name: Install
|
|
24
|
+
run: pip install -e ".[dev]" pytest pytest-cov
|
|
25
|
+
|
|
26
|
+
- name: Run tests
|
|
27
|
+
run: pytest tests/ -v --cov=dashgov --cov-report=xml --cov-report=term-missing
|
|
28
|
+
|
|
29
|
+
- name: Upload coverage
|
|
30
|
+
uses: codecov/codecov-action@v4
|
|
31
|
+
with:
|
|
32
|
+
files: coverage.xml
|
|
33
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
name: Weekly Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
schedule:
|
|
5
|
+
- cron: "0 9 * * 1" # Every Monday 09:00 UTC
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
inputs:
|
|
8
|
+
release_note:
|
|
9
|
+
description: "Optional release note (shown in GitHub release body)"
|
|
10
|
+
required: false
|
|
11
|
+
default: ""
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
# ── Gate: tests must pass ────────────────────────────────────────────────
|
|
15
|
+
test:
|
|
16
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
strategy:
|
|
19
|
+
fail-fast: true
|
|
20
|
+
matrix:
|
|
21
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: ${{ matrix.python-version }}
|
|
28
|
+
|
|
29
|
+
- name: Install
|
|
30
|
+
run: pip install -e ".[dev]" pytest pytest-cov
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: pytest tests/ -v --cov=dashgov --cov-report=xml --cov-report=term-missing
|
|
34
|
+
|
|
35
|
+
- name: Upload coverage
|
|
36
|
+
uses: codecov/codecov-action@v4
|
|
37
|
+
with:
|
|
38
|
+
files: coverage.xml
|
|
39
|
+
fail_ci_if_error: false
|
|
40
|
+
|
|
41
|
+
# ── Generate docs ────────────────────────────────────────────────────────
|
|
42
|
+
docs:
|
|
43
|
+
name: Generate API docs
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
needs: test
|
|
46
|
+
steps:
|
|
47
|
+
- uses: actions/checkout@v4
|
|
48
|
+
|
|
49
|
+
- uses: actions/setup-python@v5
|
|
50
|
+
with:
|
|
51
|
+
python-version: "3.11"
|
|
52
|
+
|
|
53
|
+
- name: Install
|
|
54
|
+
run: pip install -e ".[dev]" pdoc
|
|
55
|
+
|
|
56
|
+
- name: Generate docs
|
|
57
|
+
run: |
|
|
58
|
+
pdoc dashgov --output-dir docs/api --docformat google
|
|
59
|
+
echo "Docs generated at $(date -u)" > docs/api/.generated
|
|
60
|
+
|
|
61
|
+
- name: Upload docs artifact
|
|
62
|
+
uses: actions/upload-artifact@v4
|
|
63
|
+
with:
|
|
64
|
+
name: api-docs
|
|
65
|
+
path: docs/api/
|
|
66
|
+
|
|
67
|
+
# ── Release: tag, GitHub release, commit docs ────────────────────────────
|
|
68
|
+
release:
|
|
69
|
+
name: Bump version & release
|
|
70
|
+
runs-on: ubuntu-latest
|
|
71
|
+
needs: [test, docs]
|
|
72
|
+
permissions:
|
|
73
|
+
contents: write
|
|
74
|
+
outputs:
|
|
75
|
+
version: ${{ steps.bump.outputs.version }}
|
|
76
|
+
steps:
|
|
77
|
+
- uses: actions/checkout@v4
|
|
78
|
+
with:
|
|
79
|
+
fetch-depth: 0
|
|
80
|
+
|
|
81
|
+
- uses: actions/setup-python@v5
|
|
82
|
+
with:
|
|
83
|
+
python-version: "3.11"
|
|
84
|
+
|
|
85
|
+
- name: Install build tools
|
|
86
|
+
run: pip install hatch pdoc
|
|
87
|
+
|
|
88
|
+
- name: Bump patch version
|
|
89
|
+
id: bump
|
|
90
|
+
run: |
|
|
91
|
+
current=$(hatch version)
|
|
92
|
+
hatch version patch
|
|
93
|
+
new=$(hatch version)
|
|
94
|
+
echo "version=$new" >> $GITHUB_OUTPUT
|
|
95
|
+
echo "prev_version=$current" >> $GITHUB_OUTPUT
|
|
96
|
+
echo "Bumped $current → $new"
|
|
97
|
+
|
|
98
|
+
- name: Regenerate docs into repo
|
|
99
|
+
run: |
|
|
100
|
+
pip install -e ".[dev]"
|
|
101
|
+
pdoc dashgov --output-dir docs/api --docformat google
|
|
102
|
+
|
|
103
|
+
- name: Build wheel + sdist
|
|
104
|
+
run: hatch build
|
|
105
|
+
|
|
106
|
+
- name: Write release notes
|
|
107
|
+
env:
|
|
108
|
+
VERSION: ${{ steps.bump.outputs.version }}
|
|
109
|
+
PREV_VERSION: ${{ steps.bump.outputs.prev_version }}
|
|
110
|
+
RELEASE_NOTE: ${{ github.event.inputs.release_note }}
|
|
111
|
+
run: |
|
|
112
|
+
cat > RELEASE_NOTES.md << EOF
|
|
113
|
+
## DashGov — Data Governance v${VERSION}
|
|
114
|
+
|
|
115
|
+
**Released:** $(date -u '+%Y-%m-%d')
|
|
116
|
+
**Previous:** v${PREV_VERSION}
|
|
117
|
+
|
|
118
|
+
$( [ -n "${RELEASE_NOTE}" ] && echo "### Notes" && echo "${RELEASE_NOTE}" || true )
|
|
119
|
+
|
|
120
|
+
### What's included
|
|
121
|
+
- All tests passing across Python 3.9, 3.10, 3.11, 3.12
|
|
122
|
+
- API documentation regenerated (see \`docs/api/\`)
|
|
123
|
+
- Published to PyPI and Databricks Marketplace
|
|
124
|
+
|
|
125
|
+
### Install
|
|
126
|
+
\`\`\`bash
|
|
127
|
+
pip install dash-gov==${VERSION}
|
|
128
|
+
\`\`\`
|
|
129
|
+
|
|
130
|
+
### Quick Start (Databricks notebook)
|
|
131
|
+
\`\`\`python
|
|
132
|
+
%pip install dash-gov==${VERSION}
|
|
133
|
+
import dashgov
|
|
134
|
+
dashgov.launch()
|
|
135
|
+
\`\`\`
|
|
136
|
+
EOF
|
|
137
|
+
|
|
138
|
+
- name: Commit version bump + docs to a release branch
|
|
139
|
+
env:
|
|
140
|
+
VERSION: ${{ steps.bump.outputs.version }}
|
|
141
|
+
run: |
|
|
142
|
+
git config user.name "github-actions[bot]"
|
|
143
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
144
|
+
git push origin --delete "refs/tags/v${VERSION}" 2>/dev/null || true
|
|
145
|
+
git push origin --delete "release/v${VERSION}" 2>/dev/null || true
|
|
146
|
+
git tag -d "v${VERSION}" 2>/dev/null || true
|
|
147
|
+
git checkout -b "release/v${VERSION}"
|
|
148
|
+
git add .
|
|
149
|
+
git commit -m "release: v${VERSION} — tests passed, docs updated"
|
|
150
|
+
git tag "v${VERSION}"
|
|
151
|
+
git push origin "release/v${VERSION}"
|
|
152
|
+
git push origin "v${VERSION}"
|
|
153
|
+
|
|
154
|
+
- name: Open and auto-merge release PR
|
|
155
|
+
continue-on-error: true
|
|
156
|
+
env:
|
|
157
|
+
GH_TOKEN: ${{ secrets.RELEASE_TOKEN || github.token }}
|
|
158
|
+
VERSION: ${{ steps.bump.outputs.version }}
|
|
159
|
+
run: |
|
|
160
|
+
gh pr create --base main --head "release/v${VERSION}" \
|
|
161
|
+
--title "release: v${VERSION}" \
|
|
162
|
+
--body "Automated release PR — tests passed, docs regenerated, version bumped to v${VERSION}." \
|
|
163
|
+
2>/dev/null || true
|
|
164
|
+
gh pr merge "release/v${VERSION}" --merge --admin --delete-branch
|
|
165
|
+
|
|
166
|
+
- name: Create GitHub Release
|
|
167
|
+
uses: softprops/action-gh-release@v2
|
|
168
|
+
with:
|
|
169
|
+
tag_name: "v${{ steps.bump.outputs.version }}"
|
|
170
|
+
body_path: RELEASE_NOTES.md
|
|
171
|
+
files: dist/*
|
|
172
|
+
|
|
173
|
+
- name: Upload dist artifact for PyPI job
|
|
174
|
+
uses: actions/upload-artifact@v4
|
|
175
|
+
with:
|
|
176
|
+
name: dist
|
|
177
|
+
path: dist/
|
|
178
|
+
|
|
179
|
+
# ── Publish to PyPI (Trusted Publisher / OIDC — no token needed) ─────────
|
|
180
|
+
publish-pypi:
|
|
181
|
+
name: Publish to PyPI
|
|
182
|
+
runs-on: ubuntu-latest
|
|
183
|
+
needs: release
|
|
184
|
+
permissions:
|
|
185
|
+
id-token: write # required for OIDC trusted publisher
|
|
186
|
+
environment:
|
|
187
|
+
name: pypi
|
|
188
|
+
url: https://pypi.org/project/dash-gov
|
|
189
|
+
steps:
|
|
190
|
+
- name: Download dist
|
|
191
|
+
uses: actions/download-artifact@v4
|
|
192
|
+
with:
|
|
193
|
+
name: dist
|
|
194
|
+
path: dist/
|
|
195
|
+
|
|
196
|
+
- name: Publish to PyPI
|
|
197
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
198
|
+
|
|
199
|
+
# ── Package for Databricks Marketplace ───────────────────────────────────
|
|
200
|
+
publish-databricks:
|
|
201
|
+
name: Package for Databricks Marketplace
|
|
202
|
+
runs-on: ubuntu-latest
|
|
203
|
+
needs: release
|
|
204
|
+
steps:
|
|
205
|
+
- uses: actions/checkout@v4
|
|
206
|
+
with:
|
|
207
|
+
ref: "v${{ needs.release.outputs.version }}"
|
|
208
|
+
|
|
209
|
+
- name: Download dist
|
|
210
|
+
uses: actions/download-artifact@v4
|
|
211
|
+
with:
|
|
212
|
+
name: dist
|
|
213
|
+
path: dist/
|
|
214
|
+
|
|
215
|
+
- name: Build Marketplace bundle
  env:
    VERSION: ${{ needs.release.outputs.version }}
  run: |
    mkdir -p marketplace-bundle/files

    # Copy wheel
    cp dist/*.whl marketplace-bundle/files/

    # Generate companion notebook.
    # The target path contains spaces and an em dash, so it MUST be quoted:
    # unquoted, the shell redirects to ".../DashGov" and hands the remaining
    # words to `cat` as input file names, which fails the step.
    # The heredoc delimiter is intentionally unquoted so ${VERSION} expands.
    cat > "marketplace-bundle/files/DashGov — Data Governance_Quickstart.py" << NBEOF
    # Databricks notebook source
    # MAGIC %md
    # MAGIC # DashGov — Data Governance v${VERSION} for Databricks
    # MAGIC Install and launch the interactive UI.

    # COMMAND ----------
    # MAGIC %pip install dash-gov==${VERSION}

    # COMMAND ----------
    dbutils.library.restartPython()

    # COMMAND ----------
    import dashgov
    dashgov.launch()
    NBEOF

    # Generate listing metadata
    # NOTE(review): "categories" holds ONE comma-joined string, unlike "tags",
    # which is a list of separate strings. Confirm whether two entries
    # ("Data Governance", "Compliance") were intended.
    cat > marketplace-bundle/listing.json << LEOF
    {
      "listing_name": "DashGov — Data Governance",
      "version": "${VERSION}",
      "short_description": "Scan tables for PII and apply Unity Catalog sensitivity tags",
      "long_description": "DashGov — Data Governance provides an ipywidgets UI inside Databricks notebooks to scan Unity Catalog tables for PII and apply sensitivity tags. No coding required for business users.",
      "categories": ["Data Governance,Compliance"],
      "tags": ["governance", "pii", "databricks", "unity-catalog", "pyspark"],
      "provider": "dash-libs",
      "documentation_url": "https://github.com/dash-libs/dash-gov",
      "source_url": "https://github.com/dash-libs/dash-gov",
      "pypi_package": "dash-gov==${VERSION}"
    }
    LEOF

    # Zip the bundle
    cd marketplace-bundle && zip -r "../dashgov-marketplace-${VERSION}.zip" .
    echo "Bundle created: dashgov-marketplace-${VERSION}.zip"
|
|
261
|
+
|
|
262
|
+
- name: Upload Marketplace bundle artifact
|
|
263
|
+
uses: actions/upload-artifact@v4
|
|
264
|
+
with:
|
|
265
|
+
name: marketplace-bundle
|
|
266
|
+
path: dashgov-marketplace-*.zip
|
|
267
|
+
retention-days: 90
|
dash_gov-0.1.1/CLAUDE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# CLAUDE.md — dash-gov
|
|
2
|
+
|
|
3
|
+
Part of the **Dashlibs** suite. See ~/dashlibs for the full context.
|
|
4
|
+
|
|
5
|
+
## Purpose
|
|
6
|
+
PII scanning and Unity Catalog (UC) column tagging. `scanner.py` provides `GovernanceScanner`, which uses regex patterns plus keyword heuristics.
|
|
7
|
+
|
|
8
|
+
## Structure
|
|
9
|
+
- `dashgov/ui.py` — ipywidgets UI, `launch()` entrypoint
|
|
10
|
+
- `dashgov/*.py` — core logic
|
|
11
|
+
- `tests/` — pytest, no Spark dependency for unit tests
|
|
12
|
+
|
|
13
|
+
## Key Design Rules
|
|
14
|
+
- Never import Spark at module level — always inside functions
|
|
15
|
+
- UI calls core classes; never contains business logic
|
|
16
|
+
- `launch()` is always the public entrypoint for business users
|
|
17
|
+
|
|
18
|
+
## CI
|
|
19
|
+
- `ci.yml` — PR gate: lint → test → build
|
|
20
|
+
- `daily.yml` — 06:00 UTC: tests only (no commit)
|
|
21
|
+
- `release.yml` — Monday 09:00 UTC: patch bump + GitHub release
|
dash_gov-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dash-gov
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Data lineage and governance for Databricks — table/column lineage, classification, and a built-in notebook UI
|
|
5
|
+
Project-URL: Homepage, https://github.com/dash-libs/dash-gov
|
|
6
|
+
Author-email: Darshan Shah <darshan.innovation@gmail.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: data-catalog,databricks,governance,lineage,unity-catalog
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Information Technology
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: ipywidgets>=8.0
|
|
21
|
+
Requires-Dist: sqlglot>=23.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
24
|
+
Requires-Dist: pdoc; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# DashGov — Databricks Library
|
|
31
|
+
|
|
32
|
+
[](https://github.com/dash-libs/dash-gov/actions)
|
|
33
|
+
[](https://pypi.org/project/dash-gov/)
|
|
34
|
+
[](LICENSE)
|
|
35
|
+
|
|
36
|
+
Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
%pip install dash-gov
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import dashgov
|
|
48
|
+
dashgov.launch() # Opens interactive UI in your Databricks notebook
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Part of Dashlibs
|
|
52
|
+
|
|
53
|
+
| Library | Purpose |
|
|
54
|
+
|---|---|
|
|
55
|
+
| dash-dq | Data Quality |
|
|
56
|
+
| dash-synthetic | Synthetic Data Generation |
|
|
57
|
+
| dash-ml | ML Model Monitoring |
|
|
58
|
+
| dash-ingest | Data Ingestion |
|
|
59
|
+
| dash-gov | Data Governance |
|
|
60
|
+
| dash-ontology | Ontology & Lineage for AI |
|
|
61
|
+
|
|
62
|
+
## License
|
|
63
|
+
|
|
64
|
+
Apache 2.0
|
dash_gov-0.1.1/README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# DashGov — Databricks Library
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/dash-libs/dash-gov/actions)
|
|
4
|
+
[PyPI](https://pypi.org/project/dash-gov/)
|
|
5
|
+
[License](LICENSE)
|
|
6
|
+
|
|
7
|
+
Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
%pip install dash-gov
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import dashgov
|
|
19
|
+
dashgov.launch() # Opens interactive UI in your Databricks notebook
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Part of Dashlibs
|
|
23
|
+
|
|
24
|
+
| Library | Purpose |
|
|
25
|
+
|---|---|
|
|
26
|
+
| dash-dq | Data Quality |
|
|
27
|
+
| dash-synthetic | Synthetic Data Generation |
|
|
28
|
+
| dash-ml | ML Model Monitoring |
|
|
29
|
+
| dash-ingest | Data Ingestion |
|
|
30
|
+
| dash-gov | Data Governance |
|
|
31
|
+
| dash-ontology | Ontology & Lineage for AI |
|
|
32
|
+
|
|
33
|
+
## License
|
|
34
|
+
|
|
35
|
+
Apache 2.0
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""DashGov — Data lineage and governance for Databricks."""
|
|
2
|
+
from dashgov.lineage import LineageGraph, build_lineage_graph, fetch_uc_lineage
|
|
3
|
+
from dashgov.parser import parse_table_lineage, parse_column_lineage, parse_notebook_lineage
|
|
4
|
+
from dashgov.classifier import classify_table, classify_all
|
|
5
|
+
from dashgov.ui import launch
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.1"
|
|
8
|
+
__all__ = [
|
|
9
|
+
"LineageGraph",
|
|
10
|
+
"build_lineage_graph",
|
|
11
|
+
"fetch_uc_lineage",
|
|
12
|
+
"parse_table_lineage",
|
|
13
|
+
"parse_column_lineage",
|
|
14
|
+
"parse_notebook_lineage",
|
|
15
|
+
"classify_table",
|
|
16
|
+
"classify_all",
|
|
17
|
+
"launch",
|
|
18
|
+
]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Table role classification based on naming, schema shape, and lineage position.
|
|
3
|
+
|
|
4
|
+
Roles:
|
|
5
|
+
entity — root fact tables representing business objects (Customer, Order)
|
|
6
|
+
fact — transactional / event tables with FK refs to entities
|
|
7
|
+
junction — bridge tables expressing many:many relationships
|
|
8
|
+
aggregation — pre-computed summary / reporting tables
|
|
9
|
+
staging — intermediate / temp tables in a transformation pipeline
|
|
10
|
+
unknown — cannot be classified with confidence
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
# ── Name prefix/suffix patterns ───────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
_STAGING_PREFIXES = {"stg_", "staging_", "tmp_", "temp_", "raw_", "src_", "landing_", "bronze_"}
|
|
17
|
+
_DIMENSION_PREFIXES = {"dim_", "d_"}
|
|
18
|
+
_FACT_PREFIXES = {"fact_", "fct_", "f_"}
|
|
19
|
+
_AGG_SUFFIXES = {
|
|
20
|
+
"_agg", "_aggregated", "_summary", "_report",
|
|
21
|
+
"_metrics", "_stats", "_kpi", "_rollup", "_daily",
|
|
22
|
+
"_weekly", "_monthly", "_yearly",
|
|
23
|
+
}
|
|
24
|
+
_JUNCTION_SUFFIXES = {"_map", "_mapping", "_xref", "_bridge", "_link", "_rel", "_assoc", "_pivot"}
|
|
25
|
+
|
|
26
|
+
# Column names that strongly suggest a primary key
|
|
27
|
+
_PK_PATTERNS = {"id", "pk", "key", "uuid", "guid"}
|
|
28
|
+
# Column name endings that suggest a foreign key
|
|
29
|
+
_FK_SUFFIXES = ("_id", "_pk", "_key", "_fk", "_ref", "_uuid")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _name_lower(table_name: str) -> str:
|
|
33
|
+
"""Extract bare table name (no catalog/schema) and lowercase it."""
|
|
34
|
+
return table_name.split(".")[-1].lower()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _starts_with_any(name: str, prefixes: set[str]) -> bool:
|
|
38
|
+
return any(name.startswith(p) for p in prefixes)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ends_with_any(name: str, suffixes: set | tuple) -> bool:
|
|
42
|
+
return any(name.endswith(s) for s in suffixes)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def count_fk_columns(columns: list[dict]) -> int:
    """Count columns that look like foreign keys.

    A column counts when its lowercased ``"name"`` is not exactly ``"id"``
    and ends with one of ``_FK_SUFFIXES``. Entries without a ``"name"`` key
    are treated as having an empty name and never count.

    Args:
        columns: Column descriptors; only the ``"name"`` key is read.

    Returns:
        Number of columns matching the foreign-key naming heuristic.
    """
    # Lowercase each name once rather than once per condition.
    lowered_names = (c.get("name", "").lower() for c in columns)
    return sum(
        1 for col_name in lowered_names
        if col_name != "id" and _ends_with_any(col_name, _FK_SUFFIXES)
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def has_primary_key(columns: list[dict]) -> bool:
    """True if there's a column that looks like a primary key.

    A column qualifies when its lowercased ``"name"`` is one of
    ``_PK_PATTERNS`` (which includes ``"id"``), or when it ends with
    ``"_id"`` and is at most 10 characters long.

    Args:
        columns: Column descriptors; only the ``"name"`` key is read.
    """
    names = {c.get("name", "").lower() for c in columns}
    # Exact-name match, e.g. "id", "pk", "uuid".
    if names & _PK_PATTERNS:
        return True
    # Short "<something>_id" names; the length cap is part of the heuristic.
    return any(_ends_with_any(n, ("_id",)) and len(n) <= 10 for n in names)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def classify_table(
    full_name: str,
    columns: list[dict],
    n_upstream: int = 0,
    n_downstream: int = 0,
) -> tuple[str, float]:
    """
    Classify a table's role.

    Rules are evaluated in order and the first match wins: name-based
    staging / aggregation / dimension / junction checks, then shape- and
    lineage-based entity / fact / aggregation checks.

    Args:
        full_name: Table name, optionally dot-qualified; only the part after
            the last dot is used for name matching (case-insensitive).
        columns: Column descriptors; only the ``"name"`` key is read.
        n_upstream: Caller-supplied count of upstream lineage dependencies.
        n_downstream: Caller-supplied count of downstream lineage dependents.

    Returns (role: str, confidence: float).

    confidence is in [0.0, 1.0]:
        >= 0.85   → strong signal (name prefix, junction shape)
        0.60–0.84 → moderate signal (position in lineage + shape)
        < 0.60    → weak / unknown
    """
    name = _name_lower(full_name)
    n_cols = len(columns)
    n_fk = count_fk_columns(columns)
    has_pk = has_primary_key(columns)

    # ── Staging ──
    if _starts_with_any(name, _STAGING_PREFIXES):
        return "staging", 0.90

    # ── Aggregation ──
    if _ends_with_any(name, _AGG_SUFFIXES):
        return "aggregation", 0.90
    # NOTE(review): a fact-prefixed name with upstream lineage is reported as
    # "aggregation", not "fact" — confirm this is intended.
    if _starts_with_any(name, _FACT_PREFIXES) and n_upstream > 0:
        return "aggregation", 0.75

    # ── Dimension / Entity ──
    if _starts_with_any(name, _DIMENSION_PREFIXES):
        return "entity", 0.90

    # ── Junction ──
    if _ends_with_any(name, _JUNCTION_SUFFIXES):
        return "junction", 0.88
    if n_cols >= 2 and n_fk >= 2 and n_fk / max(n_cols, 1) >= 0.6:
        # Mostly FK columns → junction/bridge table
        return "junction", 0.80

    # ── Entity ──
    # Root source with a PK and meaningful columns
    if n_upstream == 0 and has_pk and n_cols >= 3:
        return "entity", 0.82
    if n_upstream == 0 and n_cols >= 5:
        return "entity", 0.65

    # ── Fact ──
    # Has upstream (transformed from somewhere) + FK columns
    if n_upstream >= 1 and n_fk >= 1 and n_downstream >= 1:
        return "fact", 0.70
    if n_upstream >= 1 and n_fk >= 2:
        return "fact", 0.65

    # ── Aggregation by position ──
    if n_upstream >= 2 and n_downstream == 0:
        return "aggregation", 0.60

    return "unknown", 0.40
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def classify_all(
    tables: dict,  # {full_name: {"columns": [...], "role": ...}}
    upstream_counts: dict[str, int],
    downstream_counts: dict[str, int],
) -> dict[str, tuple[str, float]]:
    """
    Classify every table in the graph.

    Args:
        tables: Mapping of full table name to an info dict; only the
            ``"columns"`` entry is read (missing → empty list).
        upstream_counts: Upstream count per table name (missing → 0).
        downstream_counts: Downstream count per table name (missing → 0).

    Returns {full_name: (role, confidence)}.
    """
    return {
        name: classify_table(
            name,
            info.get("columns", []),
            upstream_counts.get(name, 0),
            downstream_counts.get(name, 0),
        )
        for name, info in tables.items()
    }
|