datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""Domain management tools for DataHub MCP server."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from datahub_agent_context.context import get_graph
|
|
7
|
+
from datahub_agent_context.mcp_tools.base import execute_graphql
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _validate_domain_urn(domain_urn: str) -> None:
    """
    Validate that the domain URN exists in DataHub.

    Args:
        domain_urn: The domain URN to check (e.g., "urn:li:domain:marketing")

    Raises:
        ValueError: If the domain URN does not exist or is invalid
    """
    graph = get_graph()
    query = """
    query getDomain($urn: String!) {
        entity(urn: $urn) {
            urn
            type
            ... on Domain {
                properties {
                    name
                }
            }
        }
    }
    """

    try:
        result = execute_graphql(
            graph,
            query=query,
            variables={"urn": domain_urn},
            operation_name="getDomain",
        )

        entity = result.get("entity")

        if entity is None:
            raise ValueError(
                f"Domain URN does not exist in DataHub: {domain_urn}. "
                f"Please use the search tool with entity_type filter to find existing domains, "
                f"or create the domain first before assigning it."
            )

        if entity.get("type") != "DOMAIN":
            raise ValueError(
                f"The URN is not a domain entity: {domain_urn} (type: {entity.get('type')})"
            )

    except ValueError:
        # Our own validation errors propagate unchanged.
        raise
    except Exception as e:
        # Transport/GraphQL failures are wrapped so callers deal with a
        # single exception type for all validation problems.
        raise ValueError(f"Failed to validate domain URN: {str(e)}") from e
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def set_domains(
    domain_urn: str,
    entity_urns: List[str],
) -> dict:
    """Set domain for multiple DataHub entities.

    This tool allows you to assign a domain to multiple entities in a single operation.
    Useful for organizing datasets, dashboards, and other entities into logical business domains.

    Note: Domain assignment in DataHub is entity-level only. Each entity can belong to exactly one domain.
    Setting a new domain will replace any existing domain assignment.

    Args:
        domain_urn: Domain URN to assign (e.g., "urn:li:domain:marketing")
        entity_urns: List of entity URNs to assign to the domain (e.g., dataset URNs, dashboard URNs)

    Returns:
        Dictionary with:
        - success: Boolean indicating if the operation succeeded
        - message: Success or error message

    Raises:
        ValueError: If inputs are empty or the domain URN is invalid/unknown.
        RuntimeError: If the batchSetDomain mutation fails.

    Examples:
        # Set domain for multiple datasets
        set_domains(
            domain_urn="urn:li:domain:marketing",
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.campaigns,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Set domain for dashboards
        set_domains(
            domain_urn="urn:li:domain:finance",
            entity_urns=[
                "urn:li:dashboard:(urn:li:dataPlatform:looker,revenue_dashboard,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,expense_dashboard,PROD)"
            ]
        )

        # Set domain for mixed entity types
        set_domains(
            domain_urn="urn:li:domain:engineering",
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.logs,PROD)",
                "urn:li:dataFlow:(urn:li:dataPlatform:airflow,etl_pipeline,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:superset,metrics,PROD)"
            ]
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = set_domains(
                domain_urn="urn:li:domain:marketing",
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    graph = get_graph()
    if not domain_urn:
        raise ValueError("domain_urn cannot be empty")
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    # Fail fast with a helpful message before mutating anything.
    _validate_domain_urn(domain_urn)

    resources = [{"resourceUrn": resource_urn} for resource_urn in entity_urns]

    mutation = """
    mutation batchSetDomain($input: BatchSetDomainInput!) {
        batchSetDomain(input: $input)
    }
    """

    variables = {"input": {"domainUrn": domain_urn, "resources": resources}}

    try:
        result = execute_graphql(
            graph,
            query=mutation,
            variables=variables,
            operation_name="batchSetDomain",
        )

        # batchSetDomain returns a boolean success flag.
        if result.get("batchSetDomain", False):
            return {
                "success": True,
                "message": f"Successfully set domain for {len(entity_urns)} entit(ies)",
            }
        else:
            raise RuntimeError("Failed to set domain - operation returned false")

    except RuntimeError:
        # Our own failure signal propagates unchanged.
        raise
    except Exception as e:
        # Wrap transport/GraphQL errors in a single exception type.
        raise RuntimeError(f"Error setting domain: {str(e)}") from e
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def remove_domains(
    entity_urns: List[str],
) -> dict:
    """Remove domain assignment from multiple DataHub entities.

    This tool allows you to unset the domain for multiple entities in a single operation.
    Useful for removing domain assignments when reorganizing entities or correcting misassignments.

    Args:
        entity_urns: List of entity URNs to remove domain from (e.g., dataset URNs, dashboard URNs)

    Returns:
        Dictionary with:
        - success: Boolean indicating if the operation succeeded
        - message: Success or error message

    Raises:
        ValueError: If entity_urns is empty.
        RuntimeError: If the batchSetDomain mutation fails.

    Examples:
        # Remove domain from multiple datasets
        remove_domains(
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.old_table,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.deprecated,PROD)"
            ]
        )

        # Remove domain from dashboards
        remove_domains(
            entity_urns=[
                "urn:li:dashboard:(urn:li:dataPlatform:looker,old_dashboard,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,temp_dashboard,PROD)"
            ]
        )

        # Remove domain from mixed entity types
        remove_domains(
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.temp,PROD)",
                "urn:li:dataFlow:(urn:li:dataPlatform:airflow,old_pipeline,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:superset,test,PROD)"
            ]
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = remove_domains(entity_urns=["urn:li:dataset:(...)"])
    """
    graph = get_graph()
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    resources = [{"resourceUrn": resource_urn} for resource_urn in entity_urns]

    # Passing domainUrn: null to batchSetDomain unsets the domain.
    mutation = """
    mutation batchSetDomain($input: BatchSetDomainInput!) {
        batchSetDomain(input: $input)
    }
    """

    variables = {"input": {"domainUrn": None, "resources": resources}}

    try:
        result = execute_graphql(
            graph,
            query=mutation,
            variables=variables,
            operation_name="batchSetDomain",
        )

        # batchSetDomain returns a boolean success flag.
        if result.get("batchSetDomain", False):
            return {
                "success": True,
                "message": f"Successfully removed domain from {len(entity_urns)} entit(ies)",
            }
        else:
            raise RuntimeError("Failed to remove domain - operation returned false")

    except RuntimeError:
        # Our own failure signal propagates unchanged.
        raise
    except Exception as e:
        # Wrap transport/GraphQL errors in a single exception type.
        raise RuntimeError(f"Error removing domain: {str(e)}") from e
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""Tools for getting entity information."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import pathlib
|
|
6
|
+
from typing import Iterator, List, Optional
|
|
7
|
+
|
|
8
|
+
from json_repair import repair_json
|
|
9
|
+
|
|
10
|
+
from datahub.errors import ItemNotFoundError
|
|
11
|
+
from datahub_agent_context.context import get_graph
|
|
12
|
+
from datahub_agent_context.mcp_tools.base import execute_graphql
|
|
13
|
+
from datahub_agent_context.mcp_tools.helpers import (
|
|
14
|
+
clean_get_entities_response,
|
|
15
|
+
inject_urls_for_urns,
|
|
16
|
+
truncate_descriptions,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Load GraphQL queries once at import time from the sibling gql/ directory,
# so callers don't re-read the files on every request.
entity_details_fragment_gql = (
    pathlib.Path(__file__).parent / "gql/entity_details.gql"
).read_text()
# Query entities need a separate GraphQL document (see the is_query branch
# in get_entities below, which selects between the two).
query_entity_gql = (pathlib.Path(__file__).parent / "gql/query_entity.gql").read_text()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_entities(urns: List[str] | str) -> List[dict] | dict:
    """Get detailed information about one or more entities by their DataHub URNs.

    IMPORTANT: Pass an array of URNs to retrieve multiple entities in a single call - this is much
    more efficient than calling this tool multiple times. When examining search results, always pass
    an array with the top 3-10 result URNs to compare and find the best match.

    Accepts an array of URNs or a single URN. Supports all entity types including datasets,
    assertions, incidents, dashboards, charts, users, groups, and more. The response fields vary
    based on the entity type.

    Args:
        urns: List of URNs or a single URN string

    Returns:
        Single dict if single URN provided, list of dicts if multiple URNs provided.
        Each result contains entity details or error information.

    Raises:
        ItemNotFoundError: If single URN provided and entity not found

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_entities(urns=["urn:li:dataset:(...)"])
    """
    graph = get_graph()
    # Handle JSON-stringified arrays
    # Some MCP clients/LLMs pass arrays as JSON strings instead of proper lists
    if isinstance(urns, str):
        urns_str = urns.strip()  # Remove leading/trailing whitespace

        # Try to parse as JSON array first
        if urns_str.startswith("["):
            try:
                # Use json_repair to handle malformed JSON from LLMs;
                # repair_json may raise things other than JSONDecodeError,
                # so catch Exception broadly here.
                urns = json.loads(repair_json(urns_str))
                return_single = False
            except Exception as e:
                logger.warning(
                    f"Failed to parse URNs as JSON array: {e}. Treating as single URN."
                )
                # Not valid JSON, treat as single URN string
                urns = [urns_str]
                return_single = True
        else:
            # Single URN string
            urns = [urns_str]
            return_single = True
    else:
        return_single = False

    # Trim whitespace from each URN (defensive against string concatenation issues)
    urns = [urn.strip() for urn in urns]

    results = []
    for urn in urns:
        try:
            # Check if entity exists first
            if not graph.exists(urn):
                logger.warning(f"Entity not found during existence check: {urn}")
                if return_single:
                    raise ItemNotFoundError(f"Entity {urn} not found")
                results.append({"error": f"Entity {urn} not found", "urn": urn})
                continue

            # Special handling for Query entities (not part of Entity union type)
            is_query = urn.startswith("urn:li:query:")

            # Execute the appropriate GraphQL query
            variables = {"urn": urn}
            if is_query:
                result = execute_graphql(
                    graph,
                    query=query_entity_gql,
                    variables=variables,
                    operation_name="GetQueryEntity",
                )["entity"]
            else:
                result = execute_graphql(
                    graph,
                    query=entity_details_fragment_gql,
                    variables=variables,
                    operation_name="GetEntity",
                )["entity"]

            # Check if entity data was returned
            if result is None:
                raise ItemNotFoundError(
                    f"Entity {urn} exists but no data could be retrieved. "
                    f"This can happen if the entity has no aspects ingested yet, or if there's a permissions issue."
                )

            inject_urls_for_urns(graph, result, [""])
            truncate_descriptions(result)

            results.append(clean_get_entities_response(result))

        except Exception as e:
            logger.warning(f"Error fetching entity {urn}: {e}")
            # Single-URN mode surfaces the error; batch mode records it
            # per-URN so one bad entity doesn't abort the rest.
            if return_single:
                raise
            results.append({"error": str(e), "urn": urn})

    # Return single dict if single URN was passed, array otherwise
    return results[0] if return_single else results
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def list_schema_fields(
    urn: str,
    keywords: Optional[List[str] | str] = None,
    limit: int = 100,
    offset: int = 0,
) -> dict:
    """List schema fields for a dataset, with optional keyword filtering and pagination.

    Useful when schema fields were truncated in search results (schemaFieldsTruncated present)
    and you need to explore specific columns. Supports pagination for large schemas.

    Args:
        urn: Dataset URN
        keywords: Optional keywords to filter schema fields (OR matching).
            - Single string: Treated as one keyword (NOT split on whitespace). Use for field names or exact phrases.
            - List of strings: Multiple keywords, matches any (OR logic).
            - None or empty list: Returns all fields in priority order (same as get_entities).
            Matches against fieldPath, description, label, tags, and glossary terms.
            Matching fields are returned first, sorted by match count.
        limit: Maximum number of fields to return (default: 100)
        offset: Number of fields to skip for pagination (default: 0)

    Returns:
        Dictionary with:
        - urn: The dataset URN
        - fields: List of schema fields (paginated)
        - totalFields: Total number of fields in the schema
        - returned: Number of fields actually returned
        - remainingCount: Number of fields not included after offset (accounts for limit and token budget)
        - matchingCount: Number of fields that matched keywords (if keywords provided, None otherwise)
        - offset: The offset used

    Examples:
        # Single keyword (string) - search for exact field name or phrase
        list_schema_fields(urn="urn:li:dataset:(...)", keywords="user_email")
        # Returns fields matching "user_email" (like user_email_address, primary_user_email)

        # Multiple keywords (list) - OR matching
        list_schema_fields(urn="urn:li:dataset:(...)", keywords=["email", "user"])
        # Returns fields containing "email" OR "user" (user_email, contact_email, user_id, etc.)

        # Pagination through all fields
        list_schema_fields(urn="urn:li:dataset:(...)", limit=100, offset=0)  # First 100
        list_schema_fields(urn="urn:li:dataset:(...)", limit=100, offset=100)  # Next 100

        # Combine filtering + pagination
        list_schema_fields(urn="urn:li:dataset:(...)", keywords=["user"], limit=50, offset=0)

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = list_schema_fields(urn="urn:li:dataset:(...)", keywords="email")

    Raises:
        ItemNotFoundError: If entity not found
    """
    graph = get_graph()
    # Normalize keywords to list (None means no filtering).
    # keywords_lower stays None when no filtering was requested, which
    # later selects the default field ordering.
    keywords_lower = None
    if keywords is not None:
        if isinstance(keywords, str):
            keywords = [keywords]
        keywords_lower = [kw.lower() for kw in keywords]

    # Fetch entity
    if not graph.exists(urn):
        raise ItemNotFoundError(f"Entity {urn} not found")

    # Execute GraphQL query to get full schema
    variables = {"urn": urn}
    result = execute_graphql(
        graph,
        query=entity_details_fragment_gql,
        variables=variables,
        operation_name="GetEntity",
    )["entity"]

    # Check if entity data was returned
    if result is None:
        raise ItemNotFoundError(
            f"Entity {urn} exists but no data could be retrieved. "
            f"This can happen if the entity has no aspects ingested yet, or if there's a permissions issue."
        )

    # Apply same preprocessing as get_entities
    inject_urls_for_urns(graph, result, [""])
    truncate_descriptions(result)

    # Extract total field count before processing (clean_get_entities_response
    # below may trim the field list, so count from the raw response).
    total_fields = len(result.get("schemaMetadata", {}).get("fields", []))

    if total_fields == 0:
        # Short-circuit: nothing to filter, sort, or paginate.
        return {
            "urn": urn,
            "fields": [],
            "totalFields": 0,
            "returned": 0,
            "remainingCount": 0,
            "matchingCount": None,
            "offset": offset,
        }

    # Define custom sorting function for keyword matching
    sort_fn = None
    matching_count = None

    if keywords_lower:
        # Helper function to score a field by keyword matches
        def score_field_by_keywords(field: dict) -> int:
            """Score a field by counting keyword match coverage across its metadata.

            Scoring logic (OR matching):
            - Each keyword gets +1 if it appears in ANY searchable text (substring match)
            - Multiple occurrences of the same keyword in one text still count as +1
            - Higher score = more aspects of the field match the keywords

            Searchable texts (in order of priority):
            1. fieldPath (column name)
            2. description
            3. label
            4. tag names
            5. glossary term names

            Example:
                keywords = ["email", "user"]
                field = {
                    "fieldPath": "user_email",       # matches both
                    "description": "User's email",   # matches both
                    "tags": ["PII"]                  # matches neither
                }
                Score = 4 (email in fieldPath + email in desc + user in fieldPath + user in desc)

            Returns:
                Integer score (0 = no matches, higher = more coverage)
            """
            searchable_texts = [
                field.get("fieldPath", ""),
                field.get("description", ""),
                field.get("label", ""),
            ]

            # Add tag names (walrus guards skip missing/None nesting levels;
            # "properties" may itself be None, hence the `or {}`)
            if tags := field.get("tags"):
                if tag_list := tags.get("tags"):
                    searchable_texts.extend(
                        [
                            (t.get("tag", {}).get("properties") or {}).get("name", "")
                            for t in tag_list
                        ]
                    )

            # Add glossary term names (same defensive access pattern)
            if glossary_terms := field.get("glossaryTerms"):
                if terms_list := glossary_terms.get("terms"):
                    searchable_texts.extend(
                        [
                            (t.get("term", {}).get("properties") or {}).get("name", "")
                            for t in terms_list
                        ]
                    )

            # Count keyword coverage: +1 for each (keyword, text) pair that matches
            # Note: Substring matching, case-insensitive
            return sum(
                1
                for kw in keywords_lower
                for text in searchable_texts
                if text and kw in text.lower()
            )

        # Pre-compute matching count (need all fields for this, before any
        # limit/offset trimming happens downstream)
        fields_for_counting = result.get("schemaMetadata", {}).get("fields", [])
        matching_count = sum(
            1 for field in fields_for_counting if score_field_by_keywords(field) > 0
        )

        # Define sort function for clean_get_entities_response
        def sort_by_keyword_match(fields: List[dict]) -> Iterator[dict]:
            """Sort fields by keyword match count (descending), then alphabetically."""
            scored_fields = [
                (score_field_by_keywords(field), field) for field in fields
            ]
            # Negative score sorts high scores first; fieldPath breaks ties.
            scored_fields.sort(key=lambda x: (-x[0], x[1].get("fieldPath", "")))
            return iter(field for _, field in scored_fields)

        sort_fn = sort_by_keyword_match

    # Use clean_get_entities_response for consistent processing
    # (it applies sort_fn, offset, limit, and the shared token budget).
    cleaned_entity = clean_get_entities_response(
        result,
        sort_fn=sort_fn,
        offset=offset,
        limit=limit,
    )

    # Extract the cleaned fields and metadata
    schema_metadata = cleaned_entity.get("schemaMetadata", {})
    cleaned_fields = schema_metadata.get("fields", [])

    # Calculate how many fields remain after what we returned
    # This accounts for both pagination and token budget constraints
    remaining_count = total_fields - offset - len(cleaned_fields)

    return {
        "urn": urn,
        "fields": cleaned_fields,
        "totalFields": total_fields,
        "returned": len(cleaned_fields),
        "remainingCount": remaining_count,
        "matchingCount": matching_count,
        "offset": offset,
    }
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Get authenticated user information tool for DataHub MCP server."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from datahub_agent_context.context import get_graph
|
|
7
|
+
from datahub_agent_context.mcp_tools.base import execute_graphql
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_me() -> dict[str, Any]:
    """Get information about the currently authenticated user.

    This tool fetches detailed information about the authenticated user including:
    - User profile information (username, email, full name, etc.)
    - Platform privileges (what the user can do in DataHub)
    - Group memberships
    - User settings and preferences

    Returns:
        Dictionary with:
        - success: Boolean indicating if the operation succeeded
        - data: User information including corpUser and platformPrivileges
        - message: Success or error message

    Raises:
        RuntimeError: If no authenticated user is found or the query fails.

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_me()
    """
    graph = get_graph()
    # GraphQL query to get authenticated user information
    query = """
    query getMe {
        me {
            corpUser {
                type
                urn
                username
                info {
                    active
                    displayName
                    title
                    firstName
                    lastName
                    fullName
                    email
                }
                editableProperties {
                    displayName
                    title
                    pictureLink
                    teams
                    skills
                }
                groups: relationships(
                    input: { types: ["IsMemberOfGroup", "IsMemberOfNativeGroup"], direction: OUTGOING, start: 0, count: 50 }
                ) {
                    relationships {
                        entity {
                            ... on CorpGroup {
                                urn
                                name
                                properties {
                                    displayName
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    """

    try:
        result = execute_graphql(
            graph,
            query=query,
            variables={},
            operation_name="getMe",
        )

        me_data = result.get("me")
        if me_data:
            return {
                "success": True,
                "data": me_data,
                "message": "Successfully retrieved authenticated user information",
            }
        else:
            raise RuntimeError("No authenticated user found")

    except RuntimeError:
        # Our own failure signal propagates unchanged.
        raise
    except Exception as e:
        # Wrap transport/GraphQL errors in a single exception type.
        raise RuntimeError(f"Error retrieving user information: {str(e)}") from e
|