PyPI - databricks-access-audit - Versions diffs - 0.18.1__py3-none-any.whl - Mend

databricks-access-audit 0.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

databricks_access_audit/__init__.py +136 -0
databricks_access_audit/__main__.py +7 -0
databricks_access_audit/_classification.py +111 -0
databricks_access_audit/catalog_scanner.py +222 -0
databricks_access_audit/cli.py +848 -0
databricks_access_audit/client.py +377 -0
databricks_access_audit/csv_output.py +150 -0
databricks_access_audit/elevate.py +390 -0
databricks_access_audit/escalation.py +79 -0
databricks_access_audit/group_resolver.py +272 -0
databricks_access_audit/local_groups.py +200 -0
databricks_access_audit/models.py +342 -0
databricks_access_audit/principal_auditor.py +630 -0
databricks_access_audit/redundancy.py +104 -0
databricks_access_audit/revoke.py +75 -0
databricks_access_audit/schema_scanner.py +70 -0
databricks_access_audit/sdk_client.py +446 -0
databricks_access_audit/snapshot.py +266 -0
databricks_access_audit/stale_checker.py +326 -0
databricks_access_audit/table_scanner.py +74 -0
databricks_access_audit/workspace.py +144 -0
databricks_access_audit/workspace_object_scanner.py +616 -0
databricks_access_audit-0.18.1.dist-info/METADATA +889 -0
databricks_access_audit-0.18.1.dist-info/RECORD +28 -0
databricks_access_audit-0.18.1.dist-info/WHEEL +5 -0
databricks_access_audit-0.18.1.dist-info/entry_points.txt +2 -0
databricks_access_audit-0.18.1.dist-info/licenses/LICENSE +190 -0
databricks_access_audit-0.18.1.dist-info/top_level.txt +1 -0

databricks_access_audit/__init__.py ADDED

@@ -0,0 +1,136 @@
+"""Databricks Access Audit.
+Audit Databricks access across all workspaces — Unity Catalog permissions and workspace object ACLs.
+Quick start::
+    from databricks_access_audit import create_client, GroupMembershipResolver
+    client = create_client(cloud="azure", client_id="...",
+                           client_secret="...", account_id="...")
+    resolver = GroupMembershipResolver(client)
+    node = resolver.resolve_group("data-engineers")
+"""
+__version__ = "0.18.1"
+from databricks_access_audit._classification import build_member_lookups, classify_grant
+from databricks_access_audit.catalog_scanner import CatalogPermissionScanner, classify_catalog_grant
+from databricks_access_audit.client import (
+    AuditClient,
+    DatabricksAPIClient,
+    create_client,
+)
+from databricks_access_audit.csv_output import write_group_audit_csv, write_principal_audit_csv
+from databricks_access_audit.elevate import PermissionElevator
+from databricks_access_audit.escalation import ESCALATION_PRIVILEGES, detect_escalations
+from databricks_access_audit.group_resolver import GroupMembershipResolver
+from databricks_access_audit.local_groups import LocalGroupChecker
+from databricks_access_audit.models import (
+    AuditDiff,
+    CatalogGrant,
+    EffectivePermission,
+    # Feature models
+    EscalationFinding,
+    GrantSource,
+    GroupMember,
+    # Principal audit models
+    GroupMembership,
+    GroupNode,
+    LocalGroupFinding,
+    MemberType,
+    PrincipalAuditResult,
+    PrincipalSource,
+    RedundancyLevel,
+    RedundancyResult,
+    SchemaGrant,
+    StaleFinding,
+    TableGrant,
+    WorkspaceInfo,
+    WorkspaceObjectGrant,
+    WorkspaceRole,
+)
+from databricks_access_audit.principal_auditor import PrincipalAuditor
+from databricks_access_audit.redundancy import RedundancyDetector
+from databricks_access_audit.revoke import RevokeScriptGenerator
+from databricks_access_audit.schema_scanner import SchemaPermissionScanner
+from databricks_access_audit.snapshot import (
+    build_group_snapshot,
+    build_principal_snapshot,
+    diff_snapshots,
+    load_snapshot,
+    save_snapshot,
+)
+from databricks_access_audit.stale_checker import StaleGrantChecker
+from databricks_access_audit.table_scanner import TablePermissionScanner
+from databricks_access_audit.workspace import WORKSPACE_DOMAIN_MAP, WorkspaceDiscovery
+from databricks_access_audit.workspace_object_scanner import (
+    ALL_OBJECT_TYPES,
+    WorkspaceObjectScanner,
+)
+# Optional SDK client — only available when databricks-sdk is installed.
+# If importing the sdk_client module raises ImportError, both names are still
+# bound here (they are listed in __all__), so importing them from this package
+# keeps working: DatabricksSDKClient is None and SDK_AVAILABLE is False.
+try:
+    from databricks_access_audit.sdk_client import SDK_AVAILABLE, DatabricksSDKClient
+except ImportError:
+    DatabricksSDKClient = None  # type: ignore[assignment,misc]
+    SDK_AVAILABLE = False
+# Public API of the package. Every name listed here is bound by the imports
+# (or the optional-SDK fallback) above; __version__ is intentionally not listed.
+__all__ = [
+    "PrincipalSource",
+    # Clients
+    "AuditClient",
+    "DatabricksAPIClient",
+    "DatabricksSDKClient",
+    "create_client",
+    "SDK_AVAILABLE",
+    # Core modules
+    "GroupMembershipResolver",
+    "WorkspaceDiscovery",
+    "CatalogPermissionScanner",
+    "SchemaPermissionScanner",
+    "TablePermissionScanner",
+    "WorkspaceObjectScanner",
+    "ALL_OBJECT_TYPES",
+    "RedundancyDetector",
+    "RevokeScriptGenerator",
+    "PrincipalAuditor",
+    "PermissionElevator",
+    "detect_escalations",
+    "ESCALATION_PRIVILEGES",
+    "StaleGrantChecker",
+    "LocalGroupChecker",
+    # Models
+    "MemberType",
+    "GroupMember",
+    "GroupNode",
+    "WorkspaceInfo",
+    "GrantSource",
+    "CatalogGrant",
+    "SchemaGrant",
+    "TableGrant",
+    "WorkspaceObjectGrant",
+    "RedundancyLevel",
+    "RedundancyResult",
+    "GroupMembership",
+    "WorkspaceRole",
+    "EffectivePermission",
+    "PrincipalAuditResult",
+    "EscalationFinding",
+    "StaleFinding",
+    "LocalGroupFinding",
+    "AuditDiff",
+    # CSV / snapshot
+    "write_group_audit_csv",
+    "write_principal_audit_csv",
+    "build_group_snapshot",
+    "build_principal_snapshot",
+    "save_snapshot",
+    "load_snapshot",
+    "diff_snapshots",
+    # Helpers
+    "WORKSPACE_DOMAIN_MAP",
+    "classify_catalog_grant",
+    "classify_grant",
+    "build_member_lookups",
+]

databricks_access_audit/__main__.py ADDED

@@ -0,0 +1,7 @@
+"""Allow running as `python -m databricks_access_audit`."""
+import sys
+from databricks_access_audit.cli import main
+# Pass the CLI's return value to the interpreter as the process exit status.
+exit_status = main()
+sys.exit(exit_status)

databricks_access_audit/_classification.py ADDED

@@ -0,0 +1,111 @@
+"""Shared grant classification helpers used by all permission scanners."""
+from __future__ import annotations
+from typing import Dict, List, Optional, Set, Tuple
+from databricks_access_audit.models import GrantSource, GroupMember
+def build_member_lookups(
+    all_members: Dict[str, List[GroupMember]],
+) -> Tuple[Set[str], Set[str], Set[str], Set[str]]:
+    """Derive the four lookup sets that grant classification matches against.
+    Reads the ``"users"`` and ``"service_principals"`` entries of *all_members*
+    (a missing key is treated as an empty list) and collects every truthy
+    attribute value found on the members.
+    Returns ``(member_emails, member_display_names, sp_display_names, sp_app_ids)``.
+    Original casing is kept; :func:`classify_grant` does any case-insensitive
+    comparison itself.
+    """
+    users = all_members.get("users", [])
+    service_principals = all_members.get("service_principals", [])
+    # Falsy values (None, "") are left out so they can never match a principal.
+    emails: Set[str] = {user.email for user in users if user.email}
+    display_names: Set[str] = {user.display_name for user in users if user.display_name}
+    sp_display_names: Set[str] = {
+        sp.display_name for sp in service_principals if sp.display_name
+    }
+    application_ids: Set[str] = {
+        sp.application_id for sp in service_principals if sp.application_id
+    }
+    return emails, display_names, sp_display_names, application_ids
+def classify_grant(
+    principal: str,
+    target_group_name: str,
+    upstream_groups: Dict[str, str],
+    member_emails: Set[str],
+    member_names: Set[str],
+    sp_names: Set[str],
+    sp_app_ids: Set[str],
+) -> Optional[Tuple[GrantSource, str, Optional[str], bool]]:
+    """Classify a single UC grant relative to the target group.
+    Returns ``(source, principal_type, inherited_from, member_of_target)`` or
+    ``None`` when the principal is completely unrelated to the target group
+    (or is empty once backticks and surrounding whitespace are removed).
+    Checks run in priority order and the first match wins: the target group
+    itself, an upstream group, a member email, a member display name, a
+    service-principal display name, a service-principal application ID.
+    Group names (target and upstream) are matched exactly.
+    Handles several real-world Databricks quirks:
+    * Backtick-quoted principal names that the API occasionally returns.
+    * Case-insensitive email matching — Azure AD normalises email casing
+      differently across APIs, so ``alice@corp.com`` and ``Alice@Corp.com``
+      should be treated as the same identity.
+    * Display-name grants — some setups grant privileges to a user's
+      display name rather than their email address.
+    * Service principals identified by either display name or application ID.
+      Application IDs are GUIDs, whose hex digits may be written in either
+      case, so they are compared case-insensitively as well.
+    """
+    if not principal or not principal.strip():
+        return None
+    # Strip backtick quoting Databricks sometimes wraps around principal names
+    clean = principal.replace("`", "").strip()
+    if not clean:
+        # Only backticks/whitespace: nothing is left to identify a principal by,
+        # and an empty string must not match an empty name in any lookup.
+        return None
+    # ------------------------------------------------------------------ #
+    # 1. Direct — the target group itself holds this privilege             #
+    # ------------------------------------------------------------------ #
+    if clean == target_group_name or principal == target_group_name:
+        return GrantSource.DIRECT, "GROUP", None, False
+    # ------------------------------------------------------------------ #
+    # 2. Upstream — a parent / ancestor group of the target holds it       #
+    # ------------------------------------------------------------------ #
+    if principal in upstream_groups or clean in upstream_groups:
+        resolved = principal if principal in upstream_groups else clean
+        return GrantSource.UPSTREAM, "GROUP", resolved, False
+    clean_lower = clean.lower()
+    def _matches(candidates: Set[str]) -> bool:
+        # Exact hit first (a set lookup); otherwise a case-insensitive scan.
+        return clean in candidates or any(c.lower() == clean_lower for c in candidates)
+    # ------------------------------------------------------------------ #
+    # 3 + 4. Member — user identified by email or by display name          #
+    #    Some Databricks setups (especially with AAD sync) grant using     #
+    #    the display name rather than the email address.                   #
+    # ------------------------------------------------------------------ #
+    if _matches(member_emails) or _matches(member_names):
+        return GrantSource.MEMBER_DIRECT, "USER", None, True
+    # ------------------------------------------------------------------ #
+    # 5 + 6. Member — service principal identified by display name or by   #
+    #    application / client ID (a GUID, so letter case is irrelevant).   #
+    # ------------------------------------------------------------------ #
+    if _matches(sp_names) or _matches(sp_app_ids):
+        return GrantSource.MEMBER_DIRECT, "SERVICE_PRINCIPAL", None, True
+    return None

databricks_access_audit/catalog_scanner.py ADDED

@@ -0,0 +1,222 @@
+"""Cross-workspace catalog permission scanner."""
+from __future__ import annotations
+import logging
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Optional, Set
+from databricks_access_audit._classification import build_member_lookups, classify_grant
+from databricks_access_audit.client import AuditClient
+from databricks_access_audit.group_resolver import GroupMembershipResolver
+from databricks_access_audit.models import (
+    CatalogGrant,
+    GroupMember,
+    GroupNode,
+    WorkspaceInfo,
+)
+log = logging.getLogger(__name__)
+def classify_catalog_grant(
+    principal: str,
+    privileges: List[str],
+    catalog_name: str,
+    workspace: WorkspaceInfo,
+    target_group_name: str,
+    upstream_groups: Dict[str, str],
+    member_emails: set,
+    member_names: set,
+    sp_names: set,
+    sp_app_ids: set,
+) -> Optional[CatalogGrant]:
+    """Build a :class:`CatalogGrant` for *principal* when it relates to the target group.
+    The relationship (Direct / Upstream / Member Direct) is decided by
+    :func:`classify_grant`; when that returns ``None`` so does this function.
+    The grant records the principal exactly as passed in, together with the
+    catalog name and the workspace's name and URL.
+    """
+    classification = classify_grant(
+        principal,
+        target_group_name,
+        upstream_groups,
+        member_emails,
+        member_names,
+        sp_names,
+        sp_app_ids,
+    )
+    if classification is not None:
+        grant_source, principal_type, inherited_from, is_member = classification
+        return CatalogGrant(
+            catalog_name=catalog_name,
+            workspace_name=workspace.workspace_name,
+            workspace_url=workspace.workspace_url,
+            principal=principal,
+            principal_type=principal_type,
+            privileges=privileges,
+            grant_source=grant_source,
+            inherited_from=inherited_from,
+            member_of_target=is_member,
+        )
+    return None
+class CatalogPermissionScanner:
+    """Scan catalog permissions across workspaces.
+    Each workspace is scanned independently so that workspace-catalog bindings
+    are respected: the same catalog name can be attached to different subsets
+    of workspaces, and must be scanned from every workspace that can see it.
+    Duplicate workspace URLs in the input list are silently deduplicated by
+    :meth:`scan_all_workspaces` before dispatch.  Within a single
+    :meth:`scan_workspace` call, duplicate catalog names from the UC API
+    response are skipped via a local seen-set.
+    """
+    def __init__(self, api_client: AuditClient, group_resolver: GroupMembershipResolver):
+        """Store the collaborators used by every scan.
+        Parameters
+        ----------
+        api_client:
+            Client whose ``workspace_api`` method issues the Unity Catalog requests.
+        group_resolver:
+            Resolver whose ``get_group_membership_map`` supplies the group hierarchy.
+        """
+        self.group_resolver, self.api_client = group_resolver, api_client
+    def _get_catalogs(self, workspace: WorkspaceInfo) -> List[dict]:
+        """List the Unity Catalog catalogs visible from *workspace*.
+        Best-effort: any exception raised by the request is logged as a warning
+        and an empty list is returned, so callers see a failed workspace as one
+        with no catalogs.
+        """
+        try:
+            resp = self.api_client.workspace_api(
+                workspace.workspace_url, "GET", "/api/2.1/unity-catalog/catalogs"
+            )
+            # ``or []`` (rather than a ``.get`` default) also covers a response
+            # carrying an explicit ``"catalogs": null``, which would otherwise
+            # hand ``None`` to the caller's ``for`` loop.
+            return resp.get("catalogs") or []
+        except Exception as exc:
+            log.warning(
+                "Failed to list catalogs for workspace %s: %s", workspace.workspace_name, exc
+            )
+            return []
+    def _get_catalog_grants(self, workspace: WorkspaceInfo, catalog_name: str) -> List[dict]:
+        """Fetch the privilege assignments of *catalog_name* as seen from *workspace*.
+        Best-effort: any exception is logged as a warning and an empty list is
+        returned.  A missing or null ``privilege_assignments`` field also yields
+        an empty list.
+        """
+        from urllib.parse import quote
+        try:
+            # Percent-encode the name so characters such as ``#``, ``?`` or ``%``
+            # stay inside the path segment instead of truncating or corrupting
+            # the URL (which could return another catalog's grants).  Names made
+            # of letters, digits, ``_``, ``-``, ``.`` and ``~`` are unchanged.
+            encoded_name = quote(catalog_name, safe="")
+            resp = self.api_client.workspace_api(
+                workspace.workspace_url, "GET",
+                f"/api/2.1/unity-catalog/permissions/catalog/{encoded_name}",
+            )
+            return resp.get("privilege_assignments") or []
+        except Exception as exc:
+            log.warning(
+                "Failed to get grants for catalog %s on workspace %s: %s",
+                catalog_name, workspace.workspace_name, exc,
+            )
+            return []
+    def get_groups_containing_target(self, target_group_name: str) -> Dict[str, str]:
+        """Return every upstream (ancestor) group of the target as ``{name: id}``.
+        The group maps come from
+        :meth:`~databricks_access_audit.group_resolver.GroupMembershipResolver.get_group_membership_map`;
+        this method only walks the child-to-parents map breadth-first, starting
+        at the first group whose name equals *target_group_name* exactly.
+        An empty dict is returned when no such group exists.  A parent whose id
+        has no known name is keyed by its id instead.
+        """
+        id_to_name, _, child_to_parents = self.group_resolver.get_group_membership_map()
+        target_id = None
+        for group_id, group_name in id_to_name.items():
+            if group_name == target_group_name:
+                target_id = group_id
+                break
+        if not target_id:
+            return {}
+        ancestors: Dict[str, str] = {}
+        seen = {target_id}
+        pending: deque = deque((target_id,))
+        while pending:
+            child_id = pending.popleft()
+            for parent_id in child_to_parents.get(child_id, set()):
+                if parent_id in seen:
+                    # Already reached via another path (or a membership cycle).
+                    continue
+                seen.add(parent_id)
+                ancestors[id_to_name.get(parent_id, parent_id)] = parent_id
+                pending.append(parent_id)
+        return ancestors
+    def scan_workspace(
+        self,
+        workspace: WorkspaceInfo,
+        target_group_name: str,
+        group_node: GroupNode,
+        all_members: Dict[str, List[GroupMember]],
+        upstream_groups: Optional[Dict[str, str]] = None,
+    ) -> List[CatalogGrant]:
+        """Collect the catalog grants in one workspace that relate to target_group_name.
+        Every catalog listed for the workspace is inspected once (repeated or
+        empty names are skipped), and each privilege assignment that carries at
+        least one privilege is classified against the target group.
+        Parameters
+        ----------
+        upstream_groups:
+            Pre-computed ancestor group map from :meth:`get_groups_containing_target`.
+            When *None* it is computed here on demand; pass it explicitly when
+            scanning several workspaces so it is not recomputed for each one.
+        """
+        if upstream_groups is None:
+            upstream_groups = self.get_groups_containing_target(target_group_name)
+        catalogs = self._get_catalogs(workspace)
+        member_lookups = build_member_lookups(all_members)
+        found: List[CatalogGrant] = []
+        visited_catalogs: Set[str] = set()  # catalog names already handled in this call
+        for catalog in catalogs:
+            catalog_name = catalog.get("name", "")
+            if not catalog_name or catalog_name in visited_catalogs:
+                continue
+            visited_catalogs.add(catalog_name)
+            for assignment in self._get_catalog_grants(workspace, catalog_name):
+                privileges = assignment.get("privileges") or []
+                if privileges:
+                    grant = classify_catalog_grant(
+                        assignment.get("principal", ""), privileges, catalog_name, workspace,
+                        target_group_name, upstream_groups, *member_lookups,
+                    )
+                    if grant:
+                        found.append(grant)
+        return found
+    def scan_all_workspaces(
+        self,
+        workspaces: List[WorkspaceInfo],
+        target_group_name: str,
+        group_node: GroupNode,
+        all_members: Dict[str, List[GroupMember]],
+        max_workers: int = 8,
+    ) -> List[CatalogGrant]:
+        """Scan all workspaces in parallel, computing upstream groups exactly once.
+        Duplicate workspace URLs are silently deduplicated before dispatch so
+        that a workspace listed more than once is only scanned once.
+        Workers are capped at the number of unique workspaces to avoid
+        spawning idle threads, and floored at one so that ``max_workers <= 0``
+        degrades to a sequential scan instead of raising ``ValueError``.
+        Grants are returned grouped by workspace in first-seen input order,
+        regardless of which scan finishes first, so repeated runs over the same
+        input produce the same ordering.  A workspace whose scan raises is
+        logged as a warning and skipped.  With no workspaces the result is
+        ``[]`` and the upstream groups are not fetched at all.
+        """
+        # Deduplicate by URL while preserving order (dicts keep insertion order).
+        by_url: Dict[str, WorkspaceInfo] = {}
+        for ws in workspaces:
+            by_url.setdefault(ws.workspace_url, ws)
+        unique_workspaces = list(by_url.values())
+        if not unique_workspaces:
+            return []
+        # Fetch upstream groups once — the SCIM hierarchy is account-level and
+        # does not change between workspaces, so fetching N times would be wasteful.
+        upstream_groups = self.get_groups_containing_target(target_group_name)
+        workers = max(1, min(max_workers, len(unique_workspaces)))
+        all_grants: List[CatalogGrant] = []
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            submitted = [
+                (
+                    ws,
+                    pool.submit(
+                        self.scan_workspace,
+                        ws, target_group_name, group_node, all_members, upstream_groups,
+                    ),
+                )
+                for ws in unique_workspaces
+            ]
+            # Collect in submission order: the scans still run concurrently, but
+            # the merged list no longer depends on thread completion order.
+            for ws, fut in submitted:
+                try:
+                    all_grants.extend(fut.result())
+                except Exception as exc:
+                    log.warning("Skipping workspace %s due to error: %s", ws.workspace_name, exc)
+        return all_grants