databricks-access-audit 0.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ """Databricks Access Audit.
2
+
3
+ Audit Databricks access across all workspaces — Unity Catalog permissions and workspace object ACLs.
4
+
5
+ Quick start::
6
+
7
+ from databricks_access_audit import create_client, GroupMembershipResolver
8
+
9
+ client = create_client(cloud="azure", client_id="...",
10
+ client_secret="...", account_id="...")
11
+ resolver = GroupMembershipResolver(client)
12
+ node = resolver.resolve_group("data-engineers")
13
+ """
14
+
15
+ __version__ = "0.18.1"
16
+
17
+ from databricks_access_audit._classification import build_member_lookups, classify_grant
18
+ from databricks_access_audit.catalog_scanner import CatalogPermissionScanner, classify_catalog_grant
19
+ from databricks_access_audit.client import (
20
+ AuditClient,
21
+ DatabricksAPIClient,
22
+ create_client,
23
+ )
24
+ from databricks_access_audit.csv_output import write_group_audit_csv, write_principal_audit_csv
25
+ from databricks_access_audit.elevate import PermissionElevator
26
+ from databricks_access_audit.escalation import ESCALATION_PRIVILEGES, detect_escalations
27
+ from databricks_access_audit.group_resolver import GroupMembershipResolver
28
+ from databricks_access_audit.local_groups import LocalGroupChecker
29
+ from databricks_access_audit.models import (
30
+ AuditDiff,
31
+ CatalogGrant,
32
+ EffectivePermission,
33
+ # Feature models
34
+ EscalationFinding,
35
+ GrantSource,
36
+ GroupMember,
37
+ # Principal audit models
38
+ GroupMembership,
39
+ GroupNode,
40
+ LocalGroupFinding,
41
+ MemberType,
42
+ PrincipalAuditResult,
43
+ PrincipalSource,
44
+ RedundancyLevel,
45
+ RedundancyResult,
46
+ SchemaGrant,
47
+ StaleFinding,
48
+ TableGrant,
49
+ WorkspaceInfo,
50
+ WorkspaceObjectGrant,
51
+ WorkspaceRole,
52
+ )
53
+ from databricks_access_audit.principal_auditor import PrincipalAuditor
54
+ from databricks_access_audit.redundancy import RedundancyDetector
55
+ from databricks_access_audit.revoke import RevokeScriptGenerator
56
+ from databricks_access_audit.schema_scanner import SchemaPermissionScanner
57
+ from databricks_access_audit.snapshot import (
58
+ build_group_snapshot,
59
+ build_principal_snapshot,
60
+ diff_snapshots,
61
+ load_snapshot,
62
+ save_snapshot,
63
+ )
64
+ from databricks_access_audit.stale_checker import StaleGrantChecker
65
+ from databricks_access_audit.table_scanner import TablePermissionScanner
66
+ from databricks_access_audit.workspace import WORKSPACE_DOMAIN_MAP, WorkspaceDiscovery
67
+ from databricks_access_audit.workspace_object_scanner import (
68
+ ALL_OBJECT_TYPES,
69
+ WorkspaceObjectScanner,
70
+ )
71
+
72
+ # Optional SDK client — only available when databricks-sdk is installed
73
+ try:
74
+ from databricks_access_audit.sdk_client import SDK_AVAILABLE, DatabricksSDKClient
75
+ except ImportError:
76
+ DatabricksSDKClient = None # type: ignore[assignment,misc]
77
+ SDK_AVAILABLE = False
78
+
79
+ __all__ = [
80
+ "PrincipalSource",
81
+ # Clients
82
+ "AuditClient",
83
+ "DatabricksAPIClient",
84
+ "DatabricksSDKClient",
85
+ "create_client",
86
+ "SDK_AVAILABLE",
87
+ # Core modules
88
+ "GroupMembershipResolver",
89
+ "WorkspaceDiscovery",
90
+ "CatalogPermissionScanner",
91
+ "SchemaPermissionScanner",
92
+ "TablePermissionScanner",
93
+ "WorkspaceObjectScanner",
94
+ "ALL_OBJECT_TYPES",
95
+ "RedundancyDetector",
96
+ "RevokeScriptGenerator",
97
+ "PrincipalAuditor",
98
+ "PermissionElevator",
99
+ "detect_escalations",
100
+ "ESCALATION_PRIVILEGES",
101
+ "StaleGrantChecker",
102
+ "LocalGroupChecker",
103
+ # Models
104
+ "MemberType",
105
+ "GroupMember",
106
+ "GroupNode",
107
+ "WorkspaceInfo",
108
+ "GrantSource",
109
+ "CatalogGrant",
110
+ "SchemaGrant",
111
+ "TableGrant",
112
+ "WorkspaceObjectGrant",
113
+ "RedundancyLevel",
114
+ "RedundancyResult",
115
+ "GroupMembership",
116
+ "WorkspaceRole",
117
+ "EffectivePermission",
118
+ "PrincipalAuditResult",
119
+ "EscalationFinding",
120
+ "StaleFinding",
121
+ "LocalGroupFinding",
122
+ "AuditDiff",
123
+ # CSV / snapshot
124
+ "write_group_audit_csv",
125
+ "write_principal_audit_csv",
126
+ "build_group_snapshot",
127
+ "build_principal_snapshot",
128
+ "save_snapshot",
129
+ "load_snapshot",
130
+ "diff_snapshots",
131
+ # Helpers
132
+ "WORKSPACE_DOMAIN_MAP",
133
+ "classify_catalog_grant",
134
+ "classify_grant",
135
+ "build_member_lookups",
136
+ ]
@@ -0,0 +1,7 @@
1
+ """Allow running as `python -m databricks_access_audit`."""
2
+
3
+ import sys
4
+
5
+ from databricks_access_audit.cli import main
6
+
7
+ sys.exit(main())  # hand main()'s return value to sys.exit() so it becomes the process exit status
@@ -0,0 +1,111 @@
1
+ """Shared grant classification helpers used by all permission scanners."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Set, Tuple
6
+
7
+ from databricks_access_audit.models import GrantSource, GroupMember
8
+
9
+
10
def build_member_lookups(
    all_members: Dict[str, List["GroupMember"]],
) -> Tuple[Set[str], Set[str], Set[str], Set[str]]:
    """Build the lookup sets that :func:`classify_grant` matches principals against.

    Parameters
    ----------
    all_members:
        Flat members dict.  Only the ``"users"`` and ``"service_principals"``
        keys are read; a missing key or a key mapped to ``None`` is treated as
        an empty list.

    Returns
    -------
    ``(member_emails, member_display_names, sp_display_names, sp_app_ids)``.
    Falsy attribute values (``None`` / empty string) are skipped.  Values are
    stored in their original case; :func:`classify_grant` performs the
    case-insensitive comparison where needed.
    """
    member_emails: Set[str] = set()
    member_names: Set[str] = set()
    sp_names: Set[str] = set()
    sp_app_ids: Set[str] = set()

    # ``or ()`` covers both an absent key and an explicit ``None`` value, which
    # ``dict.get(key, [])`` alone would hand straight to the ``for`` loop.
    for user in all_members.get("users") or ():
        if user.email:
            member_emails.add(user.email)
        if user.display_name:
            member_names.add(user.display_name)

    for service_principal in all_members.get("service_principals") or ():
        if service_principal.display_name:
            sp_names.add(service_principal.display_name)
        if service_principal.application_id:
            sp_app_ids.add(service_principal.application_id)

    return member_emails, member_names, sp_names, sp_app_ids
37
+
38
+
39
def classify_grant(
    principal: str,
    target_group_name: str,
    upstream_groups: Dict[str, str],
    member_emails: Set[str],
    member_names: Set[str],
    sp_names: Set[str],
    sp_app_ids: Set[str],
) -> Optional[Tuple["GrantSource", str, Optional[str], bool]]:
    """Classify a single UC grant relative to the target group.

    Returns ``(source, principal_type, inherited_from, member_of_target)`` or
    ``None`` when the principal is empty or unrelated to the target group.

    Checks run in this order and the first match wins:

    1. Direct — the principal is the target group itself.
    2. Upstream — the principal is a key of ``upstream_groups``; the matching
       key is returned as ``inherited_from``.
    3. Member user — matched by email, then by display name.
    4. Member service principal — matched by display name, then by
       application ID.

    Handles several real-world Databricks quirks:

    * Backtick-quoted principal names that the API occasionally returns.
      Both the raw and the unquoted form are tried for group matches.
    * Case-insensitive email matching — ``alice@corp.com`` and
      ``Alice@Corp.com`` are treated as the same identity.
    * Display-name grants — user and service-principal display names are
      also compared case-insensitively.
    * Application IDs are compared exactly (case-sensitive).
    """
    if not principal or not principal.strip():
        return None

    # Strip backtick quoting Databricks sometimes wraps around principal names.
    clean = principal.replace("`", "").strip()
    if not clean:
        # Nothing but backticks/whitespace: there is no name to classify, and
        # letting "" through could match an empty target group name.
        return None

    # 1. Direct — the target group itself holds this privilege.
    if target_group_name in (clean, principal):
        return GrantSource.DIRECT, "GROUP", None, False

    # 2. Upstream — a parent / ancestor group of the target holds it.
    #    The raw form takes precedence over the unquoted one.
    for candidate in (principal, clean):
        if candidate in upstream_groups:
            return GrantSource.UPSTREAM, "GROUP", candidate, False

    clean_lower = clean.lower()

    def _is_known(values: Set[str]) -> bool:
        # Cheap exact hit first, then a case-insensitive scan of the set.
        return clean in values or any(v.lower() == clean_lower for v in values)

    # 3./4. Member user, identified by email or by display name.
    if _is_known(member_emails) or _is_known(member_names):
        return GrantSource.MEMBER_DIRECT, "USER", None, True

    # 5./6. Member service principal, identified by display name
    #       (case-insensitive) or by application ID (exact match only).
    if _is_known(sp_names) or clean in sp_app_ids:
        return GrantSource.MEMBER_DIRECT, "SERVICE_PRINCIPAL", None, True

    return None
@@ -0,0 +1,222 @@
1
+ """Cross-workspace catalog permission scanner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections import deque
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from typing import Dict, List, Optional, Set
9
+
10
+ from databricks_access_audit._classification import build_member_lookups, classify_grant
11
+ from databricks_access_audit.client import AuditClient
12
+ from databricks_access_audit.group_resolver import GroupMembershipResolver
13
+ from databricks_access_audit.models import (
14
+ CatalogGrant,
15
+ GroupMember,
16
+ GroupNode,
17
+ WorkspaceInfo,
18
+ )
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ def classify_catalog_grant(
24
+ principal: str,
25
+ privileges: List[str],
26
+ catalog_name: str,
27
+ workspace: WorkspaceInfo,
28
+ target_group_name: str,
29
+ upstream_groups: Dict[str, str],
30
+ member_emails: set,
31
+ member_names: set,
32
+ sp_names: set,
33
+ sp_app_ids: set,
34
+ ) -> Optional[CatalogGrant]:
35
+ """Classify a grant as Direct / Upstream / Member Direct and wrap in CatalogGrant."""
36
+ result = classify_grant(
37
+ principal, target_group_name, upstream_groups,
38
+ member_emails, member_names, sp_names, sp_app_ids,
39
+ )
40
+ if result is None:
41
+ return None
42
+ source, ptype, inherited, member = result
43
+ return CatalogGrant(
44
+ catalog_name=catalog_name,
45
+ workspace_name=workspace.workspace_name,
46
+ workspace_url=workspace.workspace_url,
47
+ principal=principal,
48
+ principal_type=ptype,
49
+ privileges=privileges,
50
+ grant_source=source,
51
+ inherited_from=inherited,
52
+ member_of_target=member,
53
+ )
54
+
55
+
56
class CatalogPermissionScanner:
    """Scan catalog permissions across workspaces.

    Each workspace is scanned independently so that workspace-catalog bindings
    are respected: the same catalog name can be attached to different subsets
    of workspaces, and must be scanned from every workspace that can see it.

    Duplicate workspace URLs in the input list are silently deduplicated by
    :meth:`scan_all_workspaces` before dispatch.  Within a single
    :meth:`scan_workspace` call, duplicate catalog names from the UC API
    response are skipped via a local seen-set.

    API failures while listing catalogs or fetching grants are logged as
    warnings and treated as "no results" rather than aborting the scan.
    """

    def __init__(self, api_client: "AuditClient", group_resolver: "GroupMembershipResolver"):
        self.api_client = api_client
        self.group_resolver = group_resolver

    def _get_catalogs(self, workspace: "WorkspaceInfo") -> List[dict]:
        """Return the catalog entries listed by *workspace*; ``[]`` on any failure."""
        try:
            resp = self.api_client.workspace_api(
                workspace.workspace_url, "GET", "/api/2.1/unity-catalog/catalogs"
            )
            # ``or []`` also covers an explicit ``"catalogs": null`` payload,
            # which a plain ``.get(key, [])`` would pass on as ``None``.
            return resp.get("catalogs") or []
        except Exception as exc:
            log.warning(
                "Failed to list catalogs for workspace %s: %s", workspace.workspace_name, exc
            )
            return []

    def _get_catalog_grants(self, workspace: "WorkspaceInfo", catalog_name: str) -> List[dict]:
        """Return the privilege assignments on *catalog_name*; ``[]`` on any failure."""
        try:
            resp = self.api_client.workspace_api(
                workspace.workspace_url, "GET",
                f"/api/2.1/unity-catalog/permissions/catalog/{catalog_name}",
            )
            return resp.get("privilege_assignments") or []
        except Exception as exc:
            log.warning(
                "Failed to get grants for catalog %s on workspace %s: %s",
                catalog_name, workspace.workspace_name, exc,
            )
            return []

    def get_groups_containing_target(self, target_group_name: str) -> Dict[str, str]:
        """Find ALL upstream (ancestor) groups of the target via BFS.

        The group hierarchy comes from
        :meth:`~databricks_access_audit.group_resolver.GroupMembershipResolver.get_group_membership_map`;
        only its id→name and child→parents maps are used here.  (Any caching
        or parallelism is the resolver's concern and is not visible here.)

        Returns a ``{group_name: group_id}`` dict of every ancestor, or ``{}``
        when no group is named *target_group_name*.  An ancestor whose id has
        no known name is keyed by its id instead.
        """
        id_to_name, _, child_to_parents = self.group_resolver.get_group_membership_map()

        target_id = next(
            (gid for gid, name in id_to_name.items() if name == target_group_name), None
        )
        if not target_id:
            return {}

        upstream: Dict[str, str] = {}
        queue: deque = deque([target_id])
        # Seeding with the target keeps membership cycles from looping forever
        # and from reporting the target as its own ancestor.
        visited = {target_id}

        while queue:
            current = queue.popleft()
            for parent_id in child_to_parents.get(current, set()):
                if parent_id in visited:
                    continue
                visited.add(parent_id)
                upstream[id_to_name.get(parent_id, parent_id)] = parent_id
                queue.append(parent_id)

        return upstream

    def scan_workspace(
        self,
        workspace: "WorkspaceInfo",
        target_group_name: str,
        group_node: "GroupNode",
        all_members: Dict[str, List["GroupMember"]],
        upstream_groups: Optional[Dict[str, str]] = None,
    ) -> List["CatalogGrant"]:
        """Scan a single workspace for catalog grants related to target_group_name.

        Parameters
        ----------
        workspace:
            Workspace to query; its ``workspace_url`` and ``workspace_name``
            are read.
        target_group_name:
            Name of the group being audited.
        group_node:
            Accepted for interface compatibility; not read by this method.
        all_members:
            Flat members dict handed to ``build_member_lookups``.
        upstream_groups:
            Pre-computed ancestor group map from :meth:`get_groups_containing_target`.
            When *None* the map is computed on demand.  Pass it explicitly
            when scanning multiple workspaces to avoid recomputing it per
            workspace.

        Returns
        -------
        One :class:`CatalogGrant` per privilege assignment whose principal is
        the target group, one of its ancestors, or one of its members.
        Assignments with no privileges are ignored.
        """
        if upstream_groups is None:
            upstream_groups = self.get_groups_containing_target(target_group_name)

        catalogs = self._get_catalogs(workspace)
        if not catalogs:
            # Nothing to classify, so skip building the member lookups.
            return []

        lookups = build_member_lookups(all_members)
        grants: List[CatalogGrant] = []
        seen_names: Set[str] = set()  # guard against duplicate catalog names in the API response

        for cat in catalogs:
            name = cat.get("name", "")
            if not name or name in seen_names:
                continue
            seen_names.add(name)

            for assignment in self._get_catalog_grants(workspace, name):
                privs = assignment.get("privileges") or []
                if not privs:
                    continue
                grant = classify_catalog_grant(
                    assignment.get("principal", ""), privs, name, workspace,
                    target_group_name, upstream_groups, *lookups,
                )
                if grant:
                    grants.append(grant)
        return grants

    def scan_all_workspaces(
        self,
        workspaces: List["WorkspaceInfo"],
        target_group_name: str,
        group_node: "GroupNode",
        all_members: Dict[str, List["GroupMember"]],
        max_workers: int = 8,
    ) -> List["CatalogGrant"]:
        """Scan all workspaces in parallel, computing upstream groups exactly once.

        Duplicate workspace URLs are silently deduplicated before dispatch so
        that a workspace listed more than once is only scanned once (the first
        occurrence wins).  Workers are capped at the number of unique
        workspaces to avoid spawning idle threads, and a ``max_workers`` of
        zero or less falls back to a single worker instead of raising.

        A workspace whose scan raises is logged and skipped; grants from the
        remaining workspaces are still returned, in completion order.
        """
        # Fetch upstream groups once and share the result with every worker
        # instead of recomputing it per workspace.
        upstream_groups = self.get_groups_containing_target(target_group_name)

        # Deduplicate by URL while preserving order; setdefault keeps the
        # first WorkspaceInfo seen for each URL.
        by_url: Dict[str, "WorkspaceInfo"] = {}
        for ws in workspaces:
            by_url.setdefault(ws.workspace_url, ws)
        unique_workspaces = list(by_url.values())

        if not unique_workspaces:
            return []

        all_grants: List[CatalogGrant] = []
        # ThreadPoolExecutor rejects max_workers <= 0 with ValueError, so
        # clamp the lower bound as well as the upper one.
        workers = max(1, min(max_workers, len(unique_workspaces)))

        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {
                pool.submit(
                    self.scan_workspace,
                    ws, target_group_name, group_node, all_members, upstream_groups,
                ): ws
                for ws in unique_workspaces
            }
            for fut in as_completed(futures):
                ws = futures[fut]
                try:
                    all_grants.extend(fut.result())
                except Exception as exc:
                    log.warning("Skipping workspace %s due to error: %s", ws.workspace_name, exc)

        return all_grants