dash-gov 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ - run: pip install ruff
18
+ - run: ruff check dashgov/
19
+
20
+ test:
21
+ runs-on: ubuntu-latest
22
+ needs: lint
23
+ strategy:
24
+ matrix:
25
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ - uses: actions/setup-python@v5
29
+ with:
30
+ python-version: ${{ matrix.python-version }}
31
+ - name: Install
32
+ run: pip install -e ".[dev]" pytest pytest-cov
33
+ - name: Test
34
+ run: pytest tests/ -v --cov=dashgov --cov-report=xml
35
+ - name: Upload coverage
36
+ uses: codecov/codecov-action@v4
37
+ with:
38
+ files: coverage.xml
39
+
40
+ build:
41
+ runs-on: ubuntu-latest
42
+ needs: test
43
+ steps:
44
+ - uses: actions/checkout@v4
45
+ - uses: actions/setup-python@v5
46
+ with:
47
+ python-version: "3.11"
48
+ - run: pip install hatch
49
+ - run: hatch build
50
+ - uses: actions/upload-artifact@v4
51
+ with:
52
+ name: dist
53
+ path: dist/
@@ -0,0 +1,33 @@
1
+ name: Daily Tests
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 6 * * *" # Every day 06:00 UTC — tests only, no commit
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ test:
10
+ name: Test (Python ${{ matrix.python-version }})
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+
23
+ - name: Install
24
+ run: pip install -e ".[dev]" pytest pytest-cov
25
+
26
+ - name: Run tests
27
+ run: pytest tests/ -v --cov=dashgov --cov-report=xml --cov-report=term-missing
28
+
29
+ - name: Upload coverage
30
+ uses: codecov/codecov-action@v4
31
+ with:
32
+ files: coverage.xml
33
+ fail_ci_if_error: false
@@ -0,0 +1,267 @@
1
+ name: Weekly Release
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 9 * * 1" # Every Monday 09:00 UTC
6
+ workflow_dispatch:
7
+ inputs:
8
+ release_note:
9
+ description: "Optional release note (shown in GitHub release body)"
10
+ required: false
11
+ default: ""
12
+
13
+ jobs:
14
+ # ── Gate: tests must pass ────────────────────────────────────────────────
15
+ test:
16
+ name: Test (Python ${{ matrix.python-version }})
17
+ runs-on: ubuntu-latest
18
+ strategy:
19
+ fail-fast: true
20
+ matrix:
21
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ matrix.python-version }}
28
+
29
+ - name: Install
30
+ run: pip install -e ".[dev]" pytest pytest-cov
31
+
32
+ - name: Run tests
33
+ run: pytest tests/ -v --cov=dashgov --cov-report=xml --cov-report=term-missing
34
+
35
+ - name: Upload coverage
36
+ uses: codecov/codecov-action@v4
37
+ with:
38
+ files: coverage.xml
39
+ fail_ci_if_error: false
40
+
41
+ # ── Generate docs ────────────────────────────────────────────────────────
42
+ docs:
43
+ name: Generate API docs
44
+ runs-on: ubuntu-latest
45
+ needs: test
46
+ steps:
47
+ - uses: actions/checkout@v4
48
+
49
+ - uses: actions/setup-python@v5
50
+ with:
51
+ python-version: "3.11"
52
+
53
+ - name: Install
54
+ run: pip install -e ".[dev]" pdoc
55
+
56
+ - name: Generate docs
57
+ run: |
58
+ pdoc dashgov --output-dir docs/api --docformat google
59
+ echo "Docs generated at $(date -u)" > docs/api/.generated
60
+
61
+ - name: Upload docs artifact
62
+ uses: actions/upload-artifact@v4
63
+ with:
64
+ name: api-docs
65
+ path: docs/api/
66
+
67
+ # ── Release: tag, GitHub release, commit docs ────────────────────────────
68
+ release:
69
+ name: Bump version & release
70
+ runs-on: ubuntu-latest
71
+ needs: [test, docs]
72
+ permissions:
73
+ contents: write
74
+ outputs:
75
+ version: ${{ steps.bump.outputs.version }}
76
+ steps:
77
+ - uses: actions/checkout@v4
78
+ with:
79
+ fetch-depth: 0
80
+
81
+ - uses: actions/setup-python@v5
82
+ with:
83
+ python-version: "3.11"
84
+
85
+ - name: Install build tools
86
+ run: pip install hatch pdoc
87
+
88
+ - name: Bump patch version
89
+ id: bump
90
+ run: |
91
+ current=$(hatch version)
92
+ hatch version patch
93
+ new=$(hatch version)
94
+ echo "version=$new" >> $GITHUB_OUTPUT
95
+ echo "prev_version=$current" >> $GITHUB_OUTPUT
96
+ echo "Bumped $current → $new"
97
+
98
+ - name: Regenerate docs into repo
99
+ run: |
100
+ pip install -e ".[dev]"
101
+ pdoc dashgov --output-dir docs/api --docformat google
102
+
103
+ - name: Build wheel + sdist
104
+ run: hatch build
105
+
106
+ - name: Write release notes
107
+ env:
108
+ VERSION: ${{ steps.bump.outputs.version }}
109
+ PREV_VERSION: ${{ steps.bump.outputs.prev_version }}
110
+ RELEASE_NOTE: ${{ github.event.inputs.release_note }}
111
+ run: |
112
+ cat > RELEASE_NOTES.md << EOF
113
+ ## DashGov — Data Governance v${VERSION}
114
+
115
+ **Released:** $(date -u '+%Y-%m-%d')
116
+ **Previous:** v${PREV_VERSION}
117
+
118
+ $( [ -n "${RELEASE_NOTE}" ] && echo "### Notes" && echo "${RELEASE_NOTE}" || true )
119
+
120
+ ### What's included
121
+ - All tests passing across Python 3.9, 3.10, 3.11, 3.12
122
+ - API documentation regenerated (see \`docs/api/\`)
123
+ - Published to PyPI and Databricks Marketplace
124
+
125
+ ### Install
126
+ \`\`\`bash
127
+ pip install dash-gov==${VERSION}
128
+ \`\`\`
129
+
130
+ ### Quick Start (Databricks notebook)
131
+ \`\`\`python
132
+ %pip install dash-gov==${VERSION}
133
+ import dashgov
134
+ dashgov.launch()
135
+ \`\`\`
136
+ EOF
137
+
138
+ - name: Commit version bump + docs to a release branch
139
+ env:
140
+ VERSION: ${{ steps.bump.outputs.version }}
141
+ run: |
142
+ git config user.name "github-actions[bot]"
143
+ git config user.email "github-actions[bot]@users.noreply.github.com"
144
+ git push origin --delete "refs/tags/v${VERSION}" 2>/dev/null || true
145
+ git push origin --delete "release/v${VERSION}" 2>/dev/null || true
146
+ git tag -d "v${VERSION}" 2>/dev/null || true
147
+ git checkout -b "release/v${VERSION}"
148
+ git add .
149
+ git commit -m "release: v${VERSION} — tests passed, docs updated"
150
+ git tag "v${VERSION}"
151
+ git push origin "release/v${VERSION}"
152
+ git push origin "v${VERSION}"
153
+
154
+ - name: Open and auto-merge release PR
155
+ continue-on-error: true
156
+ env:
157
+ GH_TOKEN: ${{ secrets.RELEASE_TOKEN || github.token }}
158
+ VERSION: ${{ steps.bump.outputs.version }}
159
+ run: |
160
+ gh pr create --base main --head "release/v${VERSION}" \
161
+ --title "release: v${VERSION}" \
162
+ --body "Automated release PR — tests passed, docs regenerated, version bumped to v${VERSION}." \
163
+ 2>/dev/null || true
164
+ gh pr merge "release/v${VERSION}" --merge --admin --delete-branch
165
+
166
+ - name: Create GitHub Release
167
+ uses: softprops/action-gh-release@v2
168
+ with:
169
+ tag_name: "v${{ steps.bump.outputs.version }}"
170
+ body_path: RELEASE_NOTES.md
171
+ files: dist/*
172
+
173
+ - name: Upload dist artifact for PyPI job
174
+ uses: actions/upload-artifact@v4
175
+ with:
176
+ name: dist
177
+ path: dist/
178
+
179
+ # ── Publish to PyPI (Trusted Publisher / OIDC — no token needed) ─────────
180
+ publish-pypi:
181
+ name: Publish to PyPI
182
+ runs-on: ubuntu-latest
183
+ needs: release
184
+ permissions:
185
+ id-token: write # required for OIDC trusted publisher
186
+ environment:
187
+ name: pypi
188
+ url: https://pypi.org/project/dash-gov
189
+ steps:
190
+ - name: Download dist
191
+ uses: actions/download-artifact@v4
192
+ with:
193
+ name: dist
194
+ path: dist/
195
+
196
+ - name: Publish to PyPI
197
+ uses: pypa/gh-action-pypi-publish@release/v1
198
+
199
+ # ── Package for Databricks Marketplace ───────────────────────────────────
200
+ publish-databricks:
201
+ name: Package for Databricks Marketplace
202
+ runs-on: ubuntu-latest
203
+ needs: release
204
+ steps:
205
+ - uses: actions/checkout@v4
206
+ with:
207
+ ref: "v${{ needs.release.outputs.version }}"
208
+
209
+ - name: Download dist
210
+ uses: actions/download-artifact@v4
211
+ with:
212
+ name: dist
213
+ path: dist/
214
+
215
+ - name: Build Marketplace bundle
216
+ env:
217
+ VERSION: ${{ needs.release.outputs.version }}
218
+ run: |
219
+ mkdir -p marketplace-bundle/files
220
+
221
+ # Copy wheel
222
+ cp dist/*.whl marketplace-bundle/files/
223
+
224
+ # Generate companion notebook
225
+ cat > "marketplace-bundle/files/DashGov — Data Governance_Quickstart.py" << NBEOF
226
+ # Databricks notebook source
227
+ # MAGIC %md
228
+ # MAGIC # DashGov — Data Governance v${VERSION} for Databricks
229
+ # MAGIC Install and launch the interactive UI.
230
+
231
+ # COMMAND ----------
232
+ # MAGIC %pip install dash-gov==${VERSION}
233
+
234
+ # COMMAND ----------
235
+ dbutils.library.restartPython()
236
+
237
+ # COMMAND ----------
238
+ import dashgov
239
+ dashgov.launch()
240
+ NBEOF
241
+
242
+ # Generate listing metadata
243
+ cat > marketplace-bundle/listing.json << LEOF
244
+ {
245
+ "listing_name": "DashGov — Data Governance",
246
+ "version": "${VERSION}",
247
+ "short_description": "Scan tables for PII and apply Unity Catalog sensitivity tags",
248
+ "long_description": "DashGov — Data Governance provides an ipywidgets UI inside Databricks notebooks to scan Unity Catalog tables for PII and apply sensitivity tags. No coding required for business users.",
249
+ "categories": ["Data Governance", "Compliance"],
250
+ "tags": ["governance", "pii", "databricks", "unity-catalog", "pyspark"],
251
+ "provider": "dash-libs",
252
+ "documentation_url": "https://github.com/dash-libs/dash-gov",
253
+ "source_url": "https://github.com/dash-libs/dash-gov",
254
+ "pypi_package": "dash-gov==${VERSION}"
255
+ }
256
+ LEOF
257
+
258
+ # Zip the bundle
259
+ cd marketplace-bundle && zip -r ../dashgov-marketplace-${VERSION}.zip .
260
+ echo "Bundle created: dashgov-marketplace-${VERSION}.zip"
261
+
262
+ - name: Upload Marketplace bundle artifact
263
+ uses: actions/upload-artifact@v4
264
+ with:
265
+ name: marketplace-bundle
266
+ path: dashgov-marketplace-*.zip
267
+ retention-days: 90
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .coverage
7
+ coverage.xml
8
+ .pytest_cache/
9
+ .ruff_cache/
@@ -0,0 +1,21 @@
1
+ # CLAUDE.md — dash-gov
2
+
3
+ Part of the **Dashlibs** suite. See ~/dashlibs for the full context.
4
+
5
+ ## Purpose
6
+ PII scanning and UC column tagging. scanner.py=GovernanceScanner, uses regex patterns + keyword heuristics
7
+
8
+ ## Structure
9
+ - `dashgov/ui.py` — ipywidgets UI, `launch()` entrypoint
10
+ - `dashgov/*.py` — core logic
11
+ - `tests/` — pytest, no Spark dependency for unit tests
12
+
13
+ ## Key Design Rules
14
+ - Never import Spark at module level — always inside functions
15
+ - UI calls core classes; never contains business logic
16
+ - `launch()` is always the public entrypoint for business users
17
+
18
+ ## CI
19
+ - `ci.yml` — PR gate: lint → test → build
20
+ - `daily.yml` — 06:00 UTC: tests only (no commit)
21
+ - `release.yml` — Monday 09:00 UTC: patch bump + GitHub release
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: dash-gov
3
+ Version: 0.1.1
4
+ Summary: Data lineage and governance for Databricks — table/column lineage, classification, and a built-in notebook UI
5
+ Project-URL: Homepage, https://github.com/dash-libs/dash-gov
6
+ Author-email: Darshan Shah <darshan.innovation@gmail.com>
7
+ License: Apache-2.0
8
+ Keywords: data-catalog,databricks,governance,lineage,unity-catalog
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Information Technology
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.9
20
+ Requires-Dist: ipywidgets>=8.0
21
+ Requires-Dist: sqlglot>=23.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: hatch; extra == 'dev'
24
+ Requires-Dist: pdoc; extra == 'dev'
25
+ Requires-Dist: pytest; extra == 'dev'
26
+ Requires-Dist: pytest-cov; extra == 'dev'
27
+ Requires-Dist: ruff; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # DashGov — Databricks Library
31
+
32
+ [![CI](https://github.com/dash-libs/dash-gov/actions/workflows/ci.yml/badge.svg)](https://github.com/dash-libs/dash-gov/actions)
33
+ [![PyPI](https://img.shields.io/pypi/v/dash-gov)](https://pypi.org/project/dash-gov/)
34
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
35
+
36
+ Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ %pip install dash-gov
42
+ ```
43
+
44
+ ## Quick Start
45
+
46
+ ```python
47
+ import dashgov
48
+ dashgov.launch() # Opens interactive UI in your Databricks notebook
49
+ ```
50
+
51
+ ## Part of Dashlibs
52
+
53
+ | Library | Purpose |
54
+ |---|---|
55
+ | dash-dq | Data Quality |
56
+ | dash-synthetic | Synthetic Data Generation |
57
+ | dash-ml | ML Model Monitoring |
58
+ | dash-ingest | Data Ingestion |
59
+ | dash-gov | Data Governance |
60
+ | dash-ontology | Ontology & Lineage for AI |
61
+
62
+ ## License
63
+
64
+ Apache 2.0
@@ -0,0 +1,35 @@
1
+ # DashGov — Databricks Library
2
+
3
+ [![CI](https://github.com/dash-libs/dash-gov/actions/workflows/ci.yml/badge.svg)](https://github.com/dash-libs/dash-gov/actions)
4
+ [![PyPI](https://img.shields.io/pypi/v/dash-gov)](https://pypi.org/project/dash-gov/)
5
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
6
+
7
+ Part of the **[Dashlibs](https://github.com/dash-libs)** suite — Databricks libraries built for business users.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ %pip install dash-gov
13
+ ```
14
+
15
+ ## Quick Start
16
+
17
+ ```python
18
+ import dashgov
19
+ dashgov.launch() # Opens interactive UI in your Databricks notebook
20
+ ```
21
+
22
+ ## Part of Dashlibs
23
+
24
+ | Library | Purpose |
25
+ |---|---|
26
+ | dash-dq | Data Quality |
27
+ | dash-synthetic | Synthetic Data Generation |
28
+ | dash-ml | ML Model Monitoring |
29
+ | dash-ingest | Data Ingestion |
30
+ | dash-gov | Data Governance |
31
+ | dash-ontology | Ontology & Lineage for AI |
32
+
33
+ ## License
34
+
35
+ Apache 2.0
@@ -0,0 +1,18 @@
1
+ """DashGov — Data lineage and governance for Databricks."""
2
+ from dashgov.lineage import LineageGraph, build_lineage_graph, fetch_uc_lineage
3
+ from dashgov.parser import parse_table_lineage, parse_column_lineage, parse_notebook_lineage
4
+ from dashgov.classifier import classify_table, classify_all
5
+ from dashgov.ui import launch
6
+
7
+ __version__ = "0.1.1"
8
+ __all__ = [
9
+ "LineageGraph",
10
+ "build_lineage_graph",
11
+ "fetch_uc_lineage",
12
+ "parse_table_lineage",
13
+ "parse_column_lineage",
14
+ "parse_notebook_lineage",
15
+ "classify_table",
16
+ "classify_all",
17
+ "launch",
18
+ ]
@@ -0,0 +1,144 @@
1
+ """
2
+ Table role classification based on naming, schema shape, and lineage position.
3
+
4
+ Roles:
5
+ entity — root source tables representing business objects (Customer, Order)
6
+ fact — transactional / event tables with FK refs to entities
7
+ junction — bridge tables expressing many:many relationships
8
+ aggregation — pre-computed summary / reporting tables
9
+ staging — intermediate / temp tables in a transformation pipeline
10
+ unknown — cannot be classified with confidence
11
+ """
12
+ from __future__ import annotations
13
+
14
+ # ── Name prefix/suffix patterns ───────────────────────────────────────────────
15
+
16
# Table-name prefixes that mark intermediate / landing data in a pipeline.
_STAGING_PREFIXES = {
    "stg_",
    "staging_",
    "tmp_",
    "temp_",
    "raw_",
    "src_",
    "landing_",
    "bronze_",
}

# Table-name prefixes conventionally used for dimension tables.
_DIMENSION_PREFIXES = {"dim_", "d_"}

# Table-name prefixes conventionally used for fact tables.
_FACT_PREFIXES = {"fact_", "fct_", "f_"}

# Table-name suffixes that indicate a pre-computed summary or report.
_AGG_SUFFIXES = {
    "_agg",
    "_aggregated",
    "_summary",
    "_report",
    "_metrics",
    "_stats",
    "_kpi",
    "_rollup",
    "_daily",
    "_weekly",
    "_monthly",
    "_yearly",
}

# Table-name suffixes that indicate a many-to-many bridge table.
_JUNCTION_SUFFIXES = {
    "_map",
    "_mapping",
    "_xref",
    "_bridge",
    "_link",
    "_rel",
    "_assoc",
    "_pivot",
}

# Exact column names that strongly suggest a primary key.
_PK_PATTERNS = {"id", "pk", "key", "uuid", "guid"}

# Column-name endings that suggest a foreign key (a tuple, so it can be
# passed straight to str.endswith).
_FK_SUFFIXES = ("_id", "_pk", "_key", "_fk", "_ref", "_uuid")
30
+
31
+
32
def _name_lower(table_name: str) -> str:
    """Return the bare, lowercased table name from a possibly qualified name.

    Only the segment after the last ``.`` is kept, so ``catalog.schema.table``
    and ``schema.table`` both reduce to ``table``. Surrounding whitespace and
    backtick quoting are removed so prefix/suffix matching sees the
    identifier itself rather than its quoting.
    """
    # rpartition returns the whole string as the last element when no "." exists.
    bare = table_name.rpartition(".")[2]
    return bare.strip().strip("`").lower()
35
+
36
+
37
def _starts_with_any(name: str, prefixes: set[str]) -> bool:
    """Return True when *name* begins with at least one of *prefixes*."""
    # str.startswith accepts a tuple of candidates; an empty tuple yields False.
    candidates = tuple(prefixes)
    return name.startswith(candidates)
39
+
40
+
41
def _ends_with_any(name: str, suffixes: set | tuple) -> bool:
    """Return True when *name* ends with at least one of *suffixes*."""
    # str.endswith accepts a tuple of candidates; an empty tuple yields False.
    candidates = tuple(suffixes)
    return name.endswith(candidates)
43
+
44
+
45
def count_fk_columns(columns: list[dict]) -> int:
    """Count the columns whose names look like foreign keys.

    A column counts when its lowercased ``name`` ends with one of
    ``_FK_SUFFIXES``. A column named exactly ``id`` is never counted.
    Columns with a missing or ``None`` name are ignored instead of raising.

    Args:
        columns: Column descriptors; only the ``"name"`` key is read.

    Returns:
        The number of foreign-key-like columns.
    """
    # `or ""` also covers {"name": None}, which .get()'s default would not.
    lowered = ((col.get("name") or "").lower() for col in columns)
    return sum(
        1 for col_name in lowered
        if col_name != "id" and _ends_with_any(col_name, _FK_SUFFIXES)
    )
52
+
53
+
54
def has_primary_key(columns: list[dict]) -> bool:
    """Return True if any column name looks like a primary key.

    A column qualifies when its lowercased name is one of ``_PK_PATTERNS``,
    is exactly ``id``, or ends in ``_id`` and is at most 10 characters long.
    Columns with a missing or ``None`` name are ignored instead of raising.

    Args:
        columns: Column descriptors; only the ``"name"`` key is read.
    """
    # `or ""` also covers {"name": None}, which .get()'s default would not.
    names = {(col.get("name") or "").lower() for col in columns}
    if names & _PK_PATTERNS:
        return True
    # Parentheses make the original `or` / `and` precedence explicit:
    # the length cap applies only to the "*_id" case.
    return any(
        n == "id" or (n.endswith("_id") and len(n) <= 10)
        for n in names
    )
61
+
62
+
63
def classify_table(
    full_name: str,
    columns: list[dict],
    n_upstream: int = 0,
    n_downstream: int = 0,
) -> tuple[str, float]:
    """
    Classify a table's role from its name, column shape, and lineage position.

    Rules are checked in a fixed order and the first match wins:
    staging prefix → aggregation suffix → fact prefix with upstream →
    dimension prefix → junction suffix → junction shape → root entity →
    fact by lineage + FK columns → aggregation by position → unknown.

    Args:
        full_name: Table name, optionally qualified; only the last
            dot-separated segment is inspected.
        columns: Column descriptors (dicts with a ``"name"`` key). ``None``
            is treated as an empty list.
        n_upstream: Count of upstream tables in the lineage graph.
        n_downstream: Count of downstream tables in the lineage graph.

    Returns:
        ``(role, confidence)`` where ``role`` is one of ``"staging"``,
        ``"aggregation"``, ``"entity"``, ``"junction"``, ``"fact"`` or
        ``"unknown"`` and ``confidence`` is in [0.0, 1.0]:
            >= 0.85    → strong signal (unambiguous name prefix or suffix)
            0.60–0.84  → moderate signal (column shape, position in lineage)
            <  0.60    → weak / unknown
    """
    # Tolerate callers that pass None when column metadata is unavailable.
    columns = columns or []

    name = _name_lower(full_name)
    n_cols = len(columns)
    n_fk = count_fk_columns(columns)
    has_pk = has_primary_key(columns)

    # ── Staging ──
    if _starts_with_any(name, _STAGING_PREFIXES):
        return "staging", 0.90

    # ── Aggregation ──
    if _ends_with_any(name, _AGG_SUFFIXES):
        return "aggregation", 0.90
    # NOTE(review): a fact-prefixed table with upstream sources is reported as
    # "aggregation", not "fact" — confirm this is the intended role.
    if _starts_with_any(name, _FACT_PREFIXES) and n_upstream > 0:
        return "aggregation", 0.75

    # ── Dimension / Entity ──
    if _starts_with_any(name, _DIMENSION_PREFIXES):
        return "entity", 0.90

    # ── Junction ──
    if _ends_with_any(name, _JUNCTION_SUFFIXES):
        return "junction", 0.88
    # Mostly FK columns → junction/bridge table. n_fk >= 2 guarantees
    # n_cols >= 2, so the division is safe without an extra guard.
    if n_fk >= 2 and n_fk / n_cols >= 0.6:
        return "junction", 0.80

    # ── Entity ──
    # Root source (nothing upstream) with a PK and meaningful columns.
    if n_upstream == 0:
        if has_pk and n_cols >= 3:
            return "entity", 0.82
        if n_cols >= 5:
            return "entity", 0.65

    # ── Fact ──
    # Has upstream (transformed from somewhere) + FK columns.
    if n_upstream >= 1 and n_fk >= 1 and n_downstream >= 1:
        return "fact", 0.70
    if n_upstream >= 1 and n_fk >= 2:
        return "fact", 0.65

    # ── Aggregation by position ──
    # Fed by several tables and consumed by none.
    if n_upstream >= 2 and n_downstream == 0:
        return "aggregation", 0.60

    return "unknown", 0.40
124
+
125
+
126
def classify_all(
    tables: dict,  # {full_name: {"columns": [...], "role": ...}}
    upstream_counts: dict[str, int],
    downstream_counts: dict[str, int],
) -> dict[str, tuple[str, float]]:
    """
    Run ``classify_table`` over every table in the graph.

    Tables absent from ``upstream_counts`` / ``downstream_counts`` are treated
    as having zero neighbours on that side; a table entry without a
    ``"columns"`` key is classified with an empty column list.

    Returns {full_name: (role, confidence)}.
    """
    results: dict[str, tuple[str, float]] = {}
    for full_name, meta in tables.items():
        cols = meta.get("columns", [])
        n_up = upstream_counts.get(full_name, 0)
        n_down = downstream_counts.get(full_name, 0)
        results[full_name] = classify_table(full_name, cols, n_up, n_down)
    return results